xref: /llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision b3a446650c2c48743e148daeb9ddec8e74bb83a2)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/CodeGen/MachineBasicBlock.h"
23 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/IR/DiagnosticInfo.h"
25 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
26 #include "llvm/Support/AtomicOrdering.h"
27 #include "llvm/TargetParser/TargetParser.h"
28 
29 using namespace llvm;
30 using namespace llvm::AMDGPU;
31 
32 #define DEBUG_TYPE "si-memory-legalizer"
33 #define PASS_NAME "SI Memory Legalizer"
34 
35 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
36     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
37     cl::desc("Use this to skip inserting cache invalidating instructions."));
38 
39 namespace {
40 
41 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
42 
43 /// Memory operation flags. Can be ORed together.
44 enum class SIMemOp {
45   NONE = 0u,
46   LOAD = 1u << 0,
47   STORE = 1u << 1,
48   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
49 };
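// Illustrative usage (not part of the original source): because SIMemOp is a
// bitmask enum, callers can combine the flags when ordering both kinds of
// accesses at once, e.g.:
//   CC->insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
//                  IsCrossAddrSpaceOrdering, Position::BEFORE);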
50 
51 /// Position to insert a new instruction relative to an existing
52 /// instruction.
53 enum class Position {
54   BEFORE,
55   AFTER
56 };
57 
58 /// The atomic synchronization scopes supported by the AMDGPU target.
59 enum class SIAtomicScope {
60   NONE,
61   SINGLETHREAD,
62   WAVEFRONT,
63   WORKGROUP,
64   AGENT,
65   SYSTEM
66 };
67 
68 /// The distinct address spaces supported by the AMDGPU target for
69 /// atomic memory operations. Can be ORed together.
70 enum class SIAtomicAddrSpace {
71   NONE = 0u,
72   GLOBAL = 1u << 0,
73   LDS = 1u << 1,
74   SCRATCH = 1u << 2,
75   GDS = 1u << 3,
76   OTHER = 1u << 4,
77 
78   /// The address spaces that can be accessed by a FLAT instruction.
79   FLAT = GLOBAL | LDS | SCRATCH,
80 
81   /// The address spaces that support atomic instructions.
82   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
83 
84   /// All address spaces.
85   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
86 
87   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
88 };
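// Illustrative usage (not part of the original source): membership tests on
// this bitmask follow the pattern used throughout this file, e.g.:
//   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
//     ... // the operation touches global memory
//   }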
89 
90 class SIMemOpInfo final {
91 private:
92 
93   friend class SIMemOpAccess;
94 
95   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
96   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
97   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
98   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
99   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
100   bool IsCrossAddressSpaceOrdering = false;
101   bool IsVolatile = false;
102   bool IsNonTemporal = false;
103   bool IsLastUse = false;
104 
105   SIMemOpInfo(
106       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
107       SIAtomicScope Scope = SIAtomicScope::SYSTEM,
108       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
109       SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
110       bool IsCrossAddressSpaceOrdering = true,
111       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
112       bool IsVolatile = false, bool IsNonTemporal = false,
113       bool IsLastUse = false)
114       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
115         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
116         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
117         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
118         IsLastUse(IsLastUse) {
119 
120     if (Ordering == AtomicOrdering::NotAtomic) {
121       assert(Scope == SIAtomicScope::NONE &&
122              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
123              !IsCrossAddressSpaceOrdering &&
124              FailureOrdering == AtomicOrdering::NotAtomic);
125       return;
126     }
127 
128     assert(Scope != SIAtomicScope::NONE &&
129            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE &&
131            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
132                SIAtomicAddrSpace::NONE);
133 
134     // There is also no cross address space ordering if the ordering
135     // address space is the same as the instruction address space and
136     // only contains a single address space.
137     if ((OrderingAddrSpace == InstrAddrSpace) &&
138         isPowerOf2_32(uint32_t(InstrAddrSpace)))
139       this->IsCrossAddressSpaceOrdering = false;
140 
141     // Limit the scope to the maximum supported by the instruction's address
142     // spaces.
143     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
144         SIAtomicAddrSpace::NONE) {
145       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
146     } else if ((InstrAddrSpace &
147                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
148                SIAtomicAddrSpace::NONE) {
149       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
150     } else if ((InstrAddrSpace &
151                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
152                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
153       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
154     }
155   }
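  // Example of the clamping above (illustrative, not from the original
  // source): an atomic whose instruction address space is only LDS (plus
  // scratch) is clamped to at most workgroup scope, since LDS is only shared
  // within a work-group and wider scopes would be meaningless for it.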
156 
157 public:
158   /// \returns Atomic synchronization scope of the machine instruction used to
159   /// create this SIMemOpInfo.
160   SIAtomicScope getScope() const {
161     return Scope;
162   }
163 
164   /// \returns Ordering constraint of the machine instruction used to
165   /// create this SIMemOpInfo.
166   AtomicOrdering getOrdering() const {
167     return Ordering;
168   }
169 
170   /// \returns Failure ordering constraint of the machine instruction used to
171   /// create this SIMemOpInfo.
172   AtomicOrdering getFailureOrdering() const {
173     return FailureOrdering;
174   }
175 
176   /// \returns The address spaces accessed by the machine
177   /// instruction used to create this SIMemOpInfo.
178   SIAtomicAddrSpace getInstrAddrSpace() const {
179     return InstrAddrSpace;
180   }
181 
182   /// \returns The address spaces that must be ordered by the machine
183   /// instruction used to create this SIMemOpInfo.
184   SIAtomicAddrSpace getOrderingAddrSpace() const {
185     return OrderingAddrSpace;
186   }
187 
188   /// \returns True iff memory ordering of operations on
189   /// different address spaces is required.
190   bool getIsCrossAddressSpaceOrdering() const {
191     return IsCrossAddressSpaceOrdering;
192   }
193 
194   /// \returns True if memory access of the machine instruction used to
195   /// create this SIMemOpInfo is volatile, false otherwise.
196   bool isVolatile() const {
197     return IsVolatile;
198   }
199 
200   /// \returns True if memory access of the machine instruction used to
201   /// create this SIMemOpInfo is nontemporal, false otherwise.
202   bool isNonTemporal() const {
203     return IsNonTemporal;
204   }
205 
206   /// \returns True if memory access of the machine instruction used to
207   /// create this SIMemOpInfo is last use, false otherwise.
208   bool isLastUse() const { return IsLastUse; }
209 
210   /// \returns True if ordering constraint of the machine instruction used to
211   /// create this SIMemOpInfo is unordered or higher, false otherwise.
212   bool isAtomic() const {
213     return Ordering != AtomicOrdering::NotAtomic;
214   }
215 
216 };
217 
218 class SIMemOpAccess final {
219 private:
220   AMDGPUMachineModuleInfo *MMI = nullptr;
221 
222   /// Reports unsupported message \p Msg for \p MI to LLVM context.
223   void reportUnsupported(const MachineBasicBlock::iterator &MI,
224                          const char *Msg) const;
225 
226   /// Inspects the target synchronization scope \p SSID and determines
227   /// the SI atomic scope it corresponds to, the address spaces it
228   /// covers, and whether the memory ordering applies between address
229   /// spaces.
230   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
231   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
232 
233   /// \returns The SIAtomicAddrSpace bit set corresponding to address space \p AS.
234   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
235 
236   /// \returns Info constructed from \p MI, which has at least one machine
237   /// memory operand.
238   std::optional<SIMemOpInfo>
239   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
240 
241 public:
242   /// Construct class to support accessing the machine memory operands
243   /// of instructions in the machine function \p MF.
244   SIMemOpAccess(MachineFunction &MF);
245 
246   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
247   std::optional<SIMemOpInfo>
248   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
249 
250   /// \returns Store info if \p MI is a store operation, "std::nullopt"
251   /// otherwise.
252   std::optional<SIMemOpInfo>
253   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
254 
255   /// \returns Atomic fence info if \p MI is an atomic fence operation,
256   /// "std::nullopt" otherwise.
257   std::optional<SIMemOpInfo>
258   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
259 
260   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
261   /// rmw operation, "std::nullopt" otherwise.
262   std::optional<SIMemOpInfo>
263   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
264 };
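// Rough usage sketch (an assumption about the driver code, not shown here):
// the pass queries each maybe-atomic instruction in turn, e.g.:
//   if (const auto MOI = MOA.getLoadInfo(MI))
//     Changed |= expandLoad(*MOI, MI);
//   else if (const auto MOI = MOA.getStoreInfo(MI))
//     Changed |= expandStore(*MOI, MI);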
265 
266 class SICacheControl {
267 protected:
268 
269   /// AMDGPU subtarget info.
270   const GCNSubtarget &ST;
271 
272   /// Instruction info.
273   const SIInstrInfo *TII = nullptr;
274 
275   IsaVersion IV;
276 
277   /// Whether to insert cache invalidating instructions.
278   bool InsertCacheInv;
279 
280   SICacheControl(const GCNSubtarget &ST);
281 
282   /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
283   /// \returns True if \p MI is modified, false otherwise.
284   bool enableNamedBit(const MachineBasicBlock::iterator MI,
285                       AMDGPU::CPol::CPol Bit) const;
286 
287 public:
288 
289   /// Create a cache control for the subtarget \p ST.
290   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
291 
292   /// Update \p MI memory load instruction to bypass any caches up to
293   /// the \p Scope memory scope for address spaces \p
294   /// AddrSpace. Return true iff the instruction was modified.
295   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
296                                      SIAtomicScope Scope,
297                                      SIAtomicAddrSpace AddrSpace) const = 0;
298 
299   /// Update \p MI memory store instruction to bypass any caches up to
300   /// the \p Scope memory scope for address spaces \p
301   /// AddrSpace. Return true iff the instruction was modified.
302   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303                                       SIAtomicScope Scope,
304                                       SIAtomicAddrSpace AddrSpace) const = 0;
305 
306   /// Update \p MI memory read-modify-write instruction to bypass any caches up
307   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308   /// iff the instruction was modified.
309   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310                                     SIAtomicScope Scope,
311                                     SIAtomicAddrSpace AddrSpace) const = 0;
312 
313   /// Update \p MI memory instruction of kind \p Op associated with address
314   /// spaces \p AddrSpace to indicate it is volatile and/or
315   /// nontemporal/last-use. Return true iff the instruction was modified.
316   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
317                                               SIAtomicAddrSpace AddrSpace,
318                                               SIMemOp Op, bool IsVolatile,
319                                               bool IsNonTemporal,
320                                               bool IsLastUse = false) const = 0;
321 
322   virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323     return false;
324   }
325 
326   /// Inserts any necessary instructions at position \p Pos relative
327   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328   /// \p Op associated with address spaces \p AddrSpace have completed. Used
329   /// between memory instructions to enforce the order they become visible as
330   /// observed by other memory instructions executing in memory scope \p Scope.
331   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332   /// address spaces. Returns true iff any instructions are inserted.
333   virtual bool insertWait(MachineBasicBlock::iterator &MI,
334                           SIAtomicScope Scope,
335                           SIAtomicAddrSpace AddrSpace,
336                           SIMemOp Op,
337                           bool IsCrossAddrSpaceOrdering,
338                           Position Pos) const = 0;
339 
340   /// Inserts any necessary instructions at position \p Pos relative to
341   /// instruction \p MI to ensure any subsequent memory instructions of this
342   /// thread with address spaces \p AddrSpace will observe the previous memory
343   /// operations by any thread for memory scopes up to memory scope \p Scope.
344   /// Returns true iff any instructions are inserted.
345   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              Position Pos) const = 0;
349 
350   /// Inserts any necessary instructions at position \p Pos relative to
351   /// instruction \p MI to ensure previous memory instructions by this thread
352   /// with address spaces \p AddrSpace have completed and can be observed by
353   /// subsequent memory instructions by any thread executing in memory scope \p
354   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
355   /// between address spaces. Returns true iff any instructions are inserted.
356   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
357                              SIAtomicScope Scope,
358                              SIAtomicAddrSpace AddrSpace,
359                              bool IsCrossAddrSpaceOrdering,
360                              Position Pos) const = 0;
361 
362   /// Virtual destructor to allow derivations to be deleted.
363   virtual ~SICacheControl() = default;
364 
365   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
366                                    MachineBasicBlock::iterator &MI) const {
367     return false;
368   }
369 };
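// The concrete subclasses below specialize this interface per hardware
// generation (GFX6/7, GFX90A, GFX940, GFX10/11, GFX12);
// SICacheControl::create() selects the appropriate one from the subtarget.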
370 
371 class SIGfx6CacheControl : public SICacheControl {
372 protected:
373 
374   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
375   /// is modified, false otherwise.
376   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
377     return enableNamedBit(MI, AMDGPU::CPol::GLC);
378   }
379 
380   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
381   /// is modified, false otherwise.
382   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
383     return enableNamedBit(MI, AMDGPU::CPol::SLC);
384   }
385 
386 public:
387 
388   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
389 
390   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
391                              SIAtomicScope Scope,
392                              SIAtomicAddrSpace AddrSpace) const override;
393 
394   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
395                               SIAtomicScope Scope,
396                               SIAtomicAddrSpace AddrSpace) const override;
397 
398   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
399                             SIAtomicScope Scope,
400                             SIAtomicAddrSpace AddrSpace) const override;
401 
402   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
403                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
404                                       bool IsVolatile, bool IsNonTemporal,
405                                       bool IsLastUse) const override;
406 
407   bool insertWait(MachineBasicBlock::iterator &MI,
408                   SIAtomicScope Scope,
409                   SIAtomicAddrSpace AddrSpace,
410                   SIMemOp Op,
411                   bool IsCrossAddrSpaceOrdering,
412                   Position Pos) const override;
413 
414   bool insertAcquire(MachineBasicBlock::iterator &MI,
415                      SIAtomicScope Scope,
416                      SIAtomicAddrSpace AddrSpace,
417                      Position Pos) const override;
418 
419   bool insertRelease(MachineBasicBlock::iterator &MI,
420                      SIAtomicScope Scope,
421                      SIAtomicAddrSpace AddrSpace,
422                      bool IsCrossAddrSpaceOrdering,
423                      Position Pos) const override;
424 };
425 
426 class SIGfx7CacheControl : public SIGfx6CacheControl {
427 public:
428 
429   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
430 
431   bool insertAcquire(MachineBasicBlock::iterator &MI,
432                      SIAtomicScope Scope,
433                      SIAtomicAddrSpace AddrSpace,
434                      Position Pos) const override;
435 
436 };
437 
438 class SIGfx90ACacheControl : public SIGfx7CacheControl {
439 public:
440 
441   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
442 
443   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
444                              SIAtomicScope Scope,
445                              SIAtomicAddrSpace AddrSpace) const override;
446 
447   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
448                               SIAtomicScope Scope,
449                               SIAtomicAddrSpace AddrSpace) const override;
450 
451   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
452                             SIAtomicScope Scope,
453                             SIAtomicAddrSpace AddrSpace) const override;
454 
455   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
456                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
457                                       bool IsVolatile, bool IsNonTemporal,
458                                       bool IsLastUse) const override;
459 
460   bool insertWait(MachineBasicBlock::iterator &MI,
461                   SIAtomicScope Scope,
462                   SIAtomicAddrSpace AddrSpace,
463                   SIMemOp Op,
464                   bool IsCrossAddrSpaceOrdering,
465                   Position Pos) const override;
466 
467   bool insertAcquire(MachineBasicBlock::iterator &MI,
468                      SIAtomicScope Scope,
469                      SIAtomicAddrSpace AddrSpace,
470                      Position Pos) const override;
471 
472   bool insertRelease(MachineBasicBlock::iterator &MI,
473                      SIAtomicScope Scope,
474                      SIAtomicAddrSpace AddrSpace,
475                      bool IsCrossAddrSpaceOrdering,
476                      Position Pos) const override;
477 };
478 
479 class SIGfx940CacheControl : public SIGfx90ACacheControl {
480 protected:
481 
482   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
483   /// is modified, false otherwise.
484   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
485     return enableNamedBit(MI, AMDGPU::CPol::SC0);
486   }
487 
488   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
489   /// is modified, false otherwise.
490   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
491     return enableNamedBit(MI, AMDGPU::CPol::SC1);
492   }
493 
494   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
495   /// is modified, false otherwise.
496   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
497     return enableNamedBit(MI, AMDGPU::CPol::NT);
498   }
499 
500 public:
501 
502   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
503 
504   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
505                              SIAtomicScope Scope,
506                              SIAtomicAddrSpace AddrSpace) const override;
507 
508   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
509                               SIAtomicScope Scope,
510                               SIAtomicAddrSpace AddrSpace) const override;
511 
512   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
513                             SIAtomicScope Scope,
514                             SIAtomicAddrSpace AddrSpace) const override;
515 
516   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
517                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
518                                       bool IsVolatile, bool IsNonTemporal,
519                                       bool IsLastUse) const override;
520 
521   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
522                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
523 
524   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
525                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
526                      Position Pos) const override;
527 
528   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
529                            MachineBasicBlock::iterator &MI) const override {
530     bool Changed = false;
531     if (ST.hasForceStoreSC0SC1() &&
532         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
533                                     SIAtomicAddrSpace::GLOBAL |
534                                     SIAtomicAddrSpace::OTHER)) !=
535          SIAtomicAddrSpace::NONE) {
536       Changed |= enableSC0Bit(MI);
537       Changed |= enableSC1Bit(MI);
538     }
539     return Changed;
540   }
541 };
542 
543 class SIGfx10CacheControl : public SIGfx7CacheControl {
544 protected:
545 
546   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
547   /// is modified, false otherwise.
548   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
549     return enableNamedBit(MI, AMDGPU::CPol::DLC);
550   }
551 
552 public:
553 
554   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
555 
556   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
557                              SIAtomicScope Scope,
558                              SIAtomicAddrSpace AddrSpace) const override;
559 
560   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
561                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
562                                       bool IsVolatile, bool IsNonTemporal,
563                                       bool IsLastUse) const override;
564 
565   bool insertWait(MachineBasicBlock::iterator &MI,
566                   SIAtomicScope Scope,
567                   SIAtomicAddrSpace AddrSpace,
568                   SIMemOp Op,
569                   bool IsCrossAddrSpaceOrdering,
570                   Position Pos) const override;
571 
572   bool insertAcquire(MachineBasicBlock::iterator &MI,
573                      SIAtomicScope Scope,
574                      SIAtomicAddrSpace AddrSpace,
575                      Position Pos) const override;
576 };
577 
578 class SIGfx11CacheControl : public SIGfx10CacheControl {
579 public:
580   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
581 
582   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
583                              SIAtomicScope Scope,
584                              SIAtomicAddrSpace AddrSpace) const override;
585 
586   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
587                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
588                                       bool IsVolatile, bool IsNonTemporal,
589                                       bool IsLastUse) const override;
590 };
591 
592 class SIGfx12CacheControl : public SIGfx11CacheControl {
593 protected:
594   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
595   // \returns Returns true if \p MI is modified, false otherwise.
596   bool setTH(const MachineBasicBlock::iterator MI,
597              AMDGPU::CPol::CPol Value) const;
598   // Sets Scope policy to \p Value if CPol operand is present in instruction \p
599   // MI. \returns True if \p MI is modified, false otherwise.
600   bool setScope(const MachineBasicBlock::iterator MI,
601                 AMDGPU::CPol::CPol Value) const;
602 
603   // Stores with system scope (SCOPE_SYS) need to wait for:
604   // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
605   // - non-returning-atomics       - wait for STORECNT==0
606   //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
607   //   since it does not distinguish atomics-with-return from regular stores.
608   // There is no need to wait if memory is cached (mtype != UC).
609   bool
610   insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
611 
612   bool setAtomicScope(const MachineBasicBlock::iterator &MI,
613                       SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
614 
615 public:
616   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
617 
618   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
619                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
620                   bool IsCrossAddrSpaceOrdering, Position Pos) const override;
621 
622   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
623                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
624 
625   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
626                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
627                                       bool IsVolatile, bool IsNonTemporal,
628                                       bool IsLastUse) const override;
629 
630   bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
631 
632   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
633                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
634                      Position Pos) const override;
635 
636   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
637                              SIAtomicScope Scope,
638                              SIAtomicAddrSpace AddrSpace) const override {
639     return setAtomicScope(MI, Scope, AddrSpace);
640   }
641 
642   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
643                               SIAtomicScope Scope,
644                               SIAtomicAddrSpace AddrSpace) const override {
645     return setAtomicScope(MI, Scope, AddrSpace);
646   }
647 
648   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
649                             SIAtomicScope Scope,
650                             SIAtomicAddrSpace AddrSpace) const override {
651     return setAtomicScope(MI, Scope, AddrSpace);
652   }
653 };
654 
655 class SIMemoryLegalizer final : public MachineFunctionPass {
656 private:
657 
658   /// Cache Control.
659   std::unique_ptr<SICacheControl> CC = nullptr;
660 
661   /// List of atomic pseudo instructions.
662   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
663 
664   /// Return true iff instruction \p MI is an atomic instruction that
665   /// returns a result.
666   bool isAtomicRet(const MachineInstr &MI) const {
667     return SIInstrInfo::isAtomicRet(MI);
668   }
669 
670   /// Removes all processed atomic pseudo instructions from the current
671   /// function. Returns true if current function is modified, false otherwise.
672   bool removeAtomicPseudoMIs();
673 
674   /// Expands load operation \p MI. Returns true if instructions are
675   /// added/deleted or \p MI is modified, false otherwise.
676   bool expandLoad(const SIMemOpInfo &MOI,
677                   MachineBasicBlock::iterator &MI);
678   /// Expands store operation \p MI. Returns true if instructions are
679   /// added/deleted or \p MI is modified, false otherwise.
680   bool expandStore(const SIMemOpInfo &MOI,
681                    MachineBasicBlock::iterator &MI);
682   /// Expands atomic fence operation \p MI. Returns true if
683   /// instructions are added/deleted or \p MI is modified, false otherwise.
684   bool expandAtomicFence(const SIMemOpInfo &MOI,
685                          MachineBasicBlock::iterator &MI);
686   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
687   /// instructions are added/deleted or \p MI is modified, false otherwise.
688   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
689                                 MachineBasicBlock::iterator &MI);
690 
691 public:
692   static char ID;
693 
694   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
695 
696   void getAnalysisUsage(AnalysisUsage &AU) const override {
697     AU.setPreservesCFG();
698     MachineFunctionPass::getAnalysisUsage(AU);
699   }
700 
701   StringRef getPassName() const override {
702     return PASS_NAME;
703   }
704 
705   bool runOnMachineFunction(MachineFunction &MF) override;
706 };
707 
708 static const StringMap<SIAtomicAddrSpace> ASNames = {{
709     {"global", SIAtomicAddrSpace::GLOBAL},
710     {"local", SIAtomicAddrSpace::LDS},
711 }};
712 
713 void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
714   const MachineFunction *MF = MI.getMF();
715   const Function &Fn = MF->getFunction();
716   SmallString<128> Str;
717   raw_svector_ostream OS(Str);
718   OS << "unknown address space '" << AS << "'; expected one of ";
719   ListSeparator LS;
720   for (const auto &[Name, Val] : ASNames)
721     OS << LS << '\'' << Name << '\'';
722   DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
723   Fn.getContext().diagnose(BadTag);
724 }
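// For example (illustrative, assumed input): an MMRA of
// !{!"amdgpu-as", !"shared"} would emit a warning along the lines of
//   "unknown address space 'shared'; expected one of 'global', 'local'".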
725 
726 /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
727 /// If this tag isn't present, or if it has no meaningful values, returns \p
728 /// Default. Otherwise returns all the address spaces concerned by the MMRA.
729 static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
730                                                SIAtomicAddrSpace Default) {
731   static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
732 
733   auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
734   if (!MMRA)
735     return Default;
736 
737   SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
738   for (const auto &[Prefix, Suffix] : MMRA) {
739     if (Prefix != FenceASPrefix)
740       continue;
741 
742     if (auto It = ASNames.find(Suffix); It != ASNames.end())
743       Result |= It->second;
744     else
745       diagnoseUnknownMMRAASName(MI, Suffix);
746   }
747 
748   return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
749 }
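// Illustrative IR (an assumption, not from this file): a fence can be limited
// to LDS only via the "amdgpu-as" MMRA, e.g.:
//   fence syncscope("workgroup") release, !mmra !0
//   ...
//   !0 = !{!"amdgpu-as", !"local"}
// in which case this helper returns SIAtomicAddrSpace::LDS instead of Default.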
750 
751 } // end anonymous namespace
752 
753 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
754                                       const char *Msg) const {
755   const Function &Func = MI->getParent()->getParent()->getFunction();
756   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
757   Func.getContext().diagnose(Diag);
758 }
759 
760 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
761 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
762                                SIAtomicAddrSpace InstrAddrSpace) const {
763   if (SSID == SyncScope::System)
764     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
765   if (SSID == MMI->getAgentSSID())
766     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
767   if (SSID == MMI->getWorkgroupSSID())
768     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
769                       true);
770   if (SSID == MMI->getWavefrontSSID())
771     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
772                       true);
773   if (SSID == SyncScope::SingleThread)
774     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
775                       true);
776   if (SSID == MMI->getSystemOneAddressSpaceSSID())
777     return std::tuple(SIAtomicScope::SYSTEM,
778                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
779   if (SSID == MMI->getAgentOneAddressSpaceSSID())
780     return std::tuple(SIAtomicScope::AGENT,
781                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
782   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
783     return std::tuple(SIAtomicScope::WORKGROUP,
784                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
785   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
786     return std::tuple(SIAtomicScope::WAVEFRONT,
787                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
788   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
789     return std::tuple(SIAtomicScope::SINGLETHREAD,
790                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
791   return std::nullopt;
792 }
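// For example (illustrative): a plain syncscope("agent") maps to
// (AGENT, ATOMIC, /*cross-AS*/ true), while the "agent-one-as" variant maps to
// (AGENT, ATOMIC & InstrAddrSpace, /*cross-AS*/ false), i.e. it only orders
// the address spaces the instruction itself accesses.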
793 
794 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
795   if (AS == AMDGPUAS::FLAT_ADDRESS)
796     return SIAtomicAddrSpace::FLAT;
797   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
798     return SIAtomicAddrSpace::GLOBAL;
799   if (AS == AMDGPUAS::LOCAL_ADDRESS)
800     return SIAtomicAddrSpace::LDS;
801   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
802     return SIAtomicAddrSpace::SCRATCH;
803   if (AS == AMDGPUAS::REGION_ADDRESS)
804     return SIAtomicAddrSpace::GDS;
805 
806   return SIAtomicAddrSpace::OTHER;
807 }
808 
809 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
810   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
811 }
812 
813 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
814     const MachineBasicBlock::iterator &MI) const {
815   assert(MI->getNumMemOperands() > 0);
816 
817   SyncScope::ID SSID = SyncScope::SingleThread;
818   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
819   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
820   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
821   bool IsNonTemporal = true;
822   bool IsVolatile = false;
823   bool IsLastUse = false;
824 
825   // Validator should check whether or not MMOs cover the entire set of
826   // locations accessed by the memory instruction.
827   for (const auto &MMO : MI->memoperands()) {
828     IsNonTemporal &= MMO->isNonTemporal();
829     IsVolatile |= MMO->isVolatile();
830     IsLastUse |= MMO->getFlags() & MOLastUse;
831     InstrAddrSpace |=
832       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
833     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
834     if (OpOrdering != AtomicOrdering::NotAtomic) {
835       const auto &IsSyncScopeInclusion =
836           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
837       if (!IsSyncScopeInclusion) {
838         reportUnsupported(MI,
839           "Unsupported non-inclusive atomic synchronization scope");
840         return std::nullopt;
841       }
842 
843       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
844       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
845       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
846              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
847       FailureOrdering =
848           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
849     }
850   }
851 
852   SIAtomicScope Scope = SIAtomicScope::NONE;
853   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
854   bool IsCrossAddressSpaceOrdering = false;
855   if (Ordering != AtomicOrdering::NotAtomic) {
856     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
857     if (!ScopeOrNone) {
858       reportUnsupported(MI, "Unsupported atomic synchronization scope");
859       return std::nullopt;
860     }
861     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
862         *ScopeOrNone;
863     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
864         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
865         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
866       reportUnsupported(MI, "Unsupported atomic address space");
867       return std::nullopt;
868     }
869   }
870   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
871                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
872                      IsNonTemporal, IsLastUse);
873 }
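// Note on merging (descriptive, based on the loop above): when an instruction
// carries several atomic memory operands, the widest synchronization scope and
// the merged (strongest) orderings are used; volatility is ORed across the
// operands, and nontemporal is only kept if every operand is nontemporal.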
874 
875 std::optional<SIMemOpInfo>
876 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
877   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
878 
879   if (!(MI->mayLoad() && !MI->mayStore()))
880     return std::nullopt;
881 
882   // Be conservative if there are no memory operands.
883   if (MI->getNumMemOperands() == 0)
884     return SIMemOpInfo();
885 
886   return constructFromMIWithMMO(MI);
887 }
888 
889 std::optional<SIMemOpInfo>
890 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
891   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
892 
893   if (!(!MI->mayLoad() && MI->mayStore()))
894     return std::nullopt;
895 
896   // Be conservative if there are no memory operands.
897   if (MI->getNumMemOperands() == 0)
898     return SIMemOpInfo();
899 
900   return constructFromMIWithMMO(MI);
901 }
902 
903 std::optional<SIMemOpInfo>
904 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
905   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
906 
907   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
908     return std::nullopt;
909 
910   AtomicOrdering Ordering =
911     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
912 
913   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
914   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
915   if (!ScopeOrNone) {
916     reportUnsupported(MI, "Unsupported atomic synchronization scope");
917     return std::nullopt;
918   }
919 
920   SIAtomicScope Scope = SIAtomicScope::NONE;
921   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
922   bool IsCrossAddressSpaceOrdering = false;
923   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
924       *ScopeOrNone;
925 
926   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
927       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
928     reportUnsupported(MI, "Unsupported atomic address space");
929     return std::nullopt;
930   }
931 
932   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
933                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
934 }
935 
936 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
937     const MachineBasicBlock::iterator &MI) const {
938   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
939 
940   if (!(MI->mayLoad() && MI->mayStore()))
941     return std::nullopt;
942 
943   // Be conservative if there are no memory operands.
944   if (MI->getNumMemOperands() == 0)
945     return SIMemOpInfo();
946 
947   return constructFromMIWithMMO(MI);
948 }
949 
950 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
951   TII = ST.getInstrInfo();
952   IV = getIsaVersion(ST.getCPU());
953   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
954 }
955 
956 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
957                                     AMDGPU::CPol::CPol Bit) const {
958   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
959   if (!CPol)
960     return false;
961 
962   CPol->setImm(CPol->getImm() | Bit);
963   return true;
964 }
965 
966 /* static */
967 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
968   GCNSubtarget::Generation Generation = ST.getGeneration();
969   if (ST.hasGFX940Insts())
970     return std::make_unique<SIGfx940CacheControl>(ST);
971   if (ST.hasGFX90AInsts())
972     return std::make_unique<SIGfx90ACacheControl>(ST);
973   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
974     return std::make_unique<SIGfx6CacheControl>(ST);
975   if (Generation < AMDGPUSubtarget::GFX10)
976     return std::make_unique<SIGfx7CacheControl>(ST);
977   if (Generation < AMDGPUSubtarget::GFX11)
978     return std::make_unique<SIGfx10CacheControl>(ST);
979   if (Generation < AMDGPUSubtarget::GFX12)
980     return std::make_unique<SIGfx11CacheControl>(ST);
981   return std::make_unique<SIGfx12CacheControl>(ST);
982 }
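// Rough usage sketch (an assumption about the caller, not shown here):
//   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
//   std::unique_ptr<SICacheControl> CC = SICacheControl::create(ST);
// The returned object is then consulted for every memory operation in the
// function.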
983 
984 bool SIGfx6CacheControl::enableLoadCacheBypass(
985     const MachineBasicBlock::iterator &MI,
986     SIAtomicScope Scope,
987     SIAtomicAddrSpace AddrSpace) const {
988   assert(MI->mayLoad() && !MI->mayStore());
989   bool Changed = false;
990 
991   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
992     switch (Scope) {
993     case SIAtomicScope::SYSTEM:
994     case SIAtomicScope::AGENT:
995       // Set L1 cache policy to MISS_EVICT.
996       // Note: there is no L2 cache bypass policy at the ISA level.
997       Changed |= enableGLCBit(MI);
998       break;
999     case SIAtomicScope::WORKGROUP:
1000     case SIAtomicScope::WAVEFRONT:
1001     case SIAtomicScope::SINGLETHREAD:
1002       // No cache to bypass.
1003       break;
1004     default:
1005       llvm_unreachable("Unsupported synchronization scope");
1006     }
1007   }
1008 
1009   /// The scratch address space does not need the global memory caches
1010   /// to be bypassed as all memory operations by the same thread are
1011   /// sequentially consistent, and no other thread can access scratch
1012   /// memory.
1013 
1014   /// Other address spaces do not have a cache.
1015 
1016   return Changed;
1017 }
1018 
1019 bool SIGfx6CacheControl::enableStoreCacheBypass(
1020     const MachineBasicBlock::iterator &MI,
1021     SIAtomicScope Scope,
1022     SIAtomicAddrSpace AddrSpace) const {
1023   assert(!MI->mayLoad() && MI->mayStore());
1024   bool Changed = false;
1025 
1026   /// The L1 cache is write-through, so it does not need to be bypassed. There
1027   /// is no bypass control for the L2 cache at the ISA level.
1028 
1029   return Changed;
1030 }
1031 
1032 bool SIGfx6CacheControl::enableRMWCacheBypass(
1033     const MachineBasicBlock::iterator &MI,
1034     SIAtomicScope Scope,
1035     SIAtomicAddrSpace AddrSpace) const {
1036   assert(MI->mayLoad() && MI->mayStore());
1037   bool Changed = false;
1038 
1039   /// Do not set GLC for RMW atomic operations as the L0/L1 cache is automatically
1040   /// bypassed, and the GLC bit is instead used to indicate if they are
1041   /// return or no-return.
1042   /// Note: there is no L2 cache coherent bypass control at the ISA level.
1043 
1044   return Changed;
1045 }
1046 
1047 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1048     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1049     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1050   // Only handle load and store, not atomic read-modify-write instructions. The
1051   // latter use glc to indicate if the atomic returns a result, so glc must not
1052   // be used for cache control.
1053   assert(MI->mayLoad() ^ MI->mayStore());
1054 
1055   // Only update load and store, not LLVM IR atomic read-modify-write
1056   // instructions. The latter are always marked as volatile, so they cannot be
1057   // handled sensibly here without pessimizing all atomics. They also do not
1058   // support the nontemporal attribute.
1059   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1060 
1061   bool Changed = false;
1062 
1063   if (IsVolatile) {
1064     // Set L1 cache policy to be MISS_EVICT for load instructions
1065     // and MISS_LRU for store instructions.
1066     // Note: there is no L2 cache bypass policy at the ISA level.
1067     if (Op == SIMemOp::LOAD)
1068       Changed |= enableGLCBit(MI);
1069 
1070     // Ensure operation has completed at system scope to cause all volatile
1071     // operations to be visible outside the program in a global order. Do not
1072     // request cross address space as only the global address space can be
1073     // observable outside the program, so no need to cause a waitcnt for LDS
1074     // address space operations.
1075     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1076                           Position::AFTER);
1077 
1078     return Changed;
1079   }
1080 
1081   if (IsNonTemporal) {
1082     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1083     // for both loads and stores, and the L2 cache policy to STREAM.
1084     Changed |= enableGLCBit(MI);
1085     Changed |= enableSLCBit(MI);
1086     return Changed;
1087   }
1088 
1089   return Changed;
1090 }
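// Example effect on GFX6 (illustrative): a volatile global load gets glc=1 and
// is followed by "s_waitcnt vmcnt(0)"; a nontemporal load or store gets
// glc=1 slc=1 and no extra wait.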
1091 
1092 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1093                                     SIAtomicScope Scope,
1094                                     SIAtomicAddrSpace AddrSpace,
1095                                     SIMemOp Op,
1096                                     bool IsCrossAddrSpaceOrdering,
1097                                     Position Pos) const {
1098   bool Changed = false;
1099 
1100   MachineBasicBlock &MBB = *MI->getParent();
1101   DebugLoc DL = MI->getDebugLoc();
1102 
1103   if (Pos == Position::AFTER)
1104     ++MI;
1105 
1106   bool VMCnt = false;
1107   bool LGKMCnt = false;
1108 
1109   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1110       SIAtomicAddrSpace::NONE) {
1111     switch (Scope) {
1112     case SIAtomicScope::SYSTEM:
1113     case SIAtomicScope::AGENT:
1114       VMCnt |= true;
1115       break;
1116     case SIAtomicScope::WORKGROUP:
1117     case SIAtomicScope::WAVEFRONT:
1118     case SIAtomicScope::SINGLETHREAD:
1119       // The L1 cache keeps all memory operations in order for
1120       // wavefronts in the same work-group.
1121       break;
1122     default:
1123       llvm_unreachable("Unsupported synchronization scope");
1124     }
1125   }
1126 
1127   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1128     switch (Scope) {
1129     case SIAtomicScope::SYSTEM:
1130     case SIAtomicScope::AGENT:
1131     case SIAtomicScope::WORKGROUP:
1132       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1133       // not needed as LDS operations for all waves are executed in a total
1134       // global ordering as observed by all waves. Required if also
1135       // synchronizing with global/GDS memory as LDS operations could be
1136       // reordered with respect to later global/GDS memory operations of the
1137       // same wave.
1138       LGKMCnt |= IsCrossAddrSpaceOrdering;
1139       break;
1140     case SIAtomicScope::WAVEFRONT:
1141     case SIAtomicScope::SINGLETHREAD:
1142       // The LDS keeps all memory operations in order for
1143       // the same wavefront.
1144       break;
1145     default:
1146       llvm_unreachable("Unsupported synchronization scope");
1147     }
1148   }
1149 
1150   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1151     switch (Scope) {
1152     case SIAtomicScope::SYSTEM:
1153     case SIAtomicScope::AGENT:
1154       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1155       // is not needed as GDS operations for all waves are executed in a total
1156       // global ordering as observed by all waves. Required if also
1157       // synchronizing with global/LDS memory as GDS operations could be
1158       // reordered with respect to later global/LDS memory operations of the
1159       // same wave.
1160       LGKMCnt |= IsCrossAddrSpaceOrdering;
1161       break;
1162     case SIAtomicScope::WORKGROUP:
1163     case SIAtomicScope::WAVEFRONT:
1164     case SIAtomicScope::SINGLETHREAD:
1165       // The GDS keeps all memory operations in order for
1166       // the same work-group.
1167       break;
1168     default:
1169       llvm_unreachable("Unsupported synchronization scope");
1170     }
1171   }
1172 
1173   if (VMCnt || LGKMCnt) {
1174     unsigned WaitCntImmediate =
1175       AMDGPU::encodeWaitcnt(IV,
1176                             VMCnt ? 0 : getVmcntBitMask(IV),
1177                             getExpcntBitMask(IV),
1178                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1179     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1180         .addImm(WaitCntImmediate);
1181     Changed = true;
1182   }
1183 
1184   if (Pos == Position::AFTER)
1185     --MI;
1186 
1187   return Changed;
1188 }
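// Example encoding (illustrative): for an agent-scope ordering of global
// accesses only, VMCnt is set and LGKMCnt is not, so the soft waitcnt above
// encodes as "s_waitcnt vmcnt(0)" with expcnt/lgkmcnt left at their maxima
// (i.e. not waited on).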
1189 
1190 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1191                                        SIAtomicScope Scope,
1192                                        SIAtomicAddrSpace AddrSpace,
1193                                        Position Pos) const {
1194   if (!InsertCacheInv)
1195     return false;
1196 
1197   bool Changed = false;
1198 
1199   MachineBasicBlock &MBB = *MI->getParent();
1200   DebugLoc DL = MI->getDebugLoc();
1201 
1202   if (Pos == Position::AFTER)
1203     ++MI;
1204 
1205   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1206     switch (Scope) {
1207     case SIAtomicScope::SYSTEM:
1208     case SIAtomicScope::AGENT:
1209       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1210       Changed = true;
1211       break;
1212     case SIAtomicScope::WORKGROUP:
1213     case SIAtomicScope::WAVEFRONT:
1214     case SIAtomicScope::SINGLETHREAD:
1215       // No cache to invalidate.
1216       break;
1217     default:
1218       llvm_unreachable("Unsupported synchronization scope");
1219     }
1220   }
1221 
1222   /// The scratch address space does not need the global memory cache
1223   /// to be flushed as all memory operations by the same thread are
1224   /// sequentially consistent, and no other thread can access scratch
1225   /// memory.
1226 
1227   /// Other address spaces do not have a cache.
1228 
1229   if (Pos == Position::AFTER)
1230     --MI;
1231 
1232   return Changed;
1233 }
1234 
1235 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1236                                        SIAtomicScope Scope,
1237                                        SIAtomicAddrSpace AddrSpace,
1238                                        bool IsCrossAddrSpaceOrdering,
1239                                        Position Pos) const {
1240   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1241                     IsCrossAddrSpaceOrdering, Pos);
1242 }
1243 
1244 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1245                                        SIAtomicScope Scope,
1246                                        SIAtomicAddrSpace AddrSpace,
1247                                        Position Pos) const {
1248   if (!InsertCacheInv)
1249     return false;
1250 
1251   bool Changed = false;
1252 
1253   MachineBasicBlock &MBB = *MI->getParent();
1254   DebugLoc DL = MI->getDebugLoc();
1255 
1256   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1257 
1258   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1259                                     ? AMDGPU::BUFFER_WBINVL1
1260                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1261 
1262   if (Pos == Position::AFTER)
1263     ++MI;
1264 
1265   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1266     switch (Scope) {
1267     case SIAtomicScope::SYSTEM:
1268     case SIAtomicScope::AGENT:
1269       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1270       Changed = true;
1271       break;
1272     case SIAtomicScope::WORKGROUP:
1273     case SIAtomicScope::WAVEFRONT:
1274     case SIAtomicScope::SINGLETHREAD:
1275       // No cache to invalidate.
1276       break;
1277     default:
1278       llvm_unreachable("Unsupported synchronization scope");
1279     }
1280   }
1281 
1282   /// The scratch address space does not need the global memory cache
1283   /// to be flushed as all memory operations by the same thread are
1284   /// sequentially consistent, and no other thread can access scratch
1285   /// memory.
1286 
1287   /// Other address spaces do not have a cache.
1288 
1289   if (Pos == Position::AFTER)
1290     --MI;
1291 
1292   return Changed;
1293 }
1294 
1295 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1296     const MachineBasicBlock::iterator &MI,
1297     SIAtomicScope Scope,
1298     SIAtomicAddrSpace AddrSpace) const {
1299   assert(MI->mayLoad() && !MI->mayStore());
1300   bool Changed = false;
1301 
1302   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1303     switch (Scope) {
1304     case SIAtomicScope::SYSTEM:
1305     case SIAtomicScope::AGENT:
1306       // Set the L1 cache policy to MISS_LRU.
1307       // Note: there is no L2 cache bypass policy at the ISA level.
1308       Changed |= enableGLCBit(MI);
1309       break;
1310     case SIAtomicScope::WORKGROUP:
1311       // In threadgroup split mode the waves of a work-group can be executing on
1312       // different CUs. Therefore the L1, which is per CU, needs to be bypassed.
1313       // Otherwise in non-threadgroup split mode all waves of a work-group are
1314       // on the same CU, and so the L1 does not need to be bypassed.
1315       if (ST.isTgSplitEnabled())
1316         Changed |= enableGLCBit(MI);
1317       break;
1318     case SIAtomicScope::WAVEFRONT:
1319     case SIAtomicScope::SINGLETHREAD:
1320       // No cache to bypass.
1321       break;
1322     default:
1323       llvm_unreachable("Unsupported synchronization scope");
1324     }
1325   }
1326 
1327   /// The scratch address space does not need the global memory caches
1328   /// to be bypassed as all memory operations by the same thread are
1329   /// sequentially consistent, and no other thread can access scratch
1330   /// memory.
1331 
1332   /// Other address spaces do not have a cache.
1333 
1334   return Changed;
1335 }
1336 
1337 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1338     const MachineBasicBlock::iterator &MI,
1339     SIAtomicScope Scope,
1340     SIAtomicAddrSpace AddrSpace) const {
1341   assert(!MI->mayLoad() && MI->mayStore());
1342   bool Changed = false;
1343 
1344   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1345     switch (Scope) {
1346     case SIAtomicScope::SYSTEM:
1347     case SIAtomicScope::AGENT:
1348       /// Do not set glc for store atomic operations as they implicitly write
1349       /// through the L1 cache.
1350       break;
1351     case SIAtomicScope::WORKGROUP:
1352     case SIAtomicScope::WAVEFRONT:
1353     case SIAtomicScope::SINGLETHREAD:
1354       // No cache to bypass. Store atomics implicitly write through the L1
1355       // cache.
1356       break;
1357     default:
1358       llvm_unreachable("Unsupported synchronization scope");
1359     }
1360   }
1361 
1362   /// The scratch address space does not need the global memory caches
1363   /// to be bypassed as all memory operations by the same thread are
1364   /// sequentially consistent, and no other thread can access scratch
1365   /// memory.
1366 
1367   /// Other address spaces do not have a cache.
1368 
1369   return Changed;
1370 }
1371 
1372 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1373     const MachineBasicBlock::iterator &MI,
1374     SIAtomicScope Scope,
1375     SIAtomicAddrSpace AddrSpace) const {
1376   assert(MI->mayLoad() && MI->mayStore());
1377   bool Changed = false;
1378 
1379   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1380     switch (Scope) {
1381     case SIAtomicScope::SYSTEM:
1382     case SIAtomicScope::AGENT:
1383       /// Do not set glc for RMW atomic operations as they implicitly bypass
1384       /// the L1 cache, and the glc bit is instead used to indicate if they are
1385       /// return or no-return.
1386       break;
1387     case SIAtomicScope::WORKGROUP:
1388     case SIAtomicScope::WAVEFRONT:
1389     case SIAtomicScope::SINGLETHREAD:
1390       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1391       break;
1392     default:
1393       llvm_unreachable("Unsupported synchronization scope");
1394     }
1395   }
1396 
1397   return Changed;
1398 }
1399 
1400 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1401     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1402     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1403   // Only handle load and store, not atomic read-modify-write instructions. The
1404   // latter use glc to indicate if the atomic returns a result, so glc must
1405   // not be used for cache control.
1406   assert(MI->mayLoad() ^ MI->mayStore());
1407 
1408   // Only update load and store, not LLVM IR atomic read-modify-write
1409   // instructions. The latter are always marked as volatile, so they cannot
1410   // sensibly be handled here without pessimizing all atomics. They also do not
1411   // support the nontemporal attribute.
1412   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1413 
1414   bool Changed = false;
1415 
1416   if (IsVolatile) {
1417     // Set L1 cache policy to be MISS_EVICT for load instructions
1418     // and MISS_LRU for store instructions.
1419     // Note: there is no L2 cache bypass policy at the ISA level.
1420     if (Op == SIMemOp::LOAD)
1421       Changed |= enableGLCBit(MI);
1422 
1423     // Ensure operation has completed at system scope to cause all volatile
1424     // operations to be visible outside the program in a global order. Do not
1425     // request cross address space as only the global address space can be
1426     // observable outside the program, so no need to cause a waitcnt for LDS
1427     // address space operations.
1428     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1429                           Position::AFTER);
1430 
1431     return Changed;
1432   }
1433 
1434   if (IsNonTemporal) {
1435     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1436     // for both loads and stores, and the L2 cache policy to STREAM.
1437     Changed |= enableGLCBit(MI);
1438     Changed |= enableSLCBit(MI);
1439     return Changed;
1440   }
1441 
1442   return Changed;
1443 }
1444 
1445 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1446                                       SIAtomicScope Scope,
1447                                       SIAtomicAddrSpace AddrSpace,
1448                                       SIMemOp Op,
1449                                       bool IsCrossAddrSpaceOrdering,
1450                                       Position Pos) const {
1451   if (ST.isTgSplitEnabled()) {
1452     // In threadgroup split mode the waves of a work-group can be executing on
1453     // different CUs. Therefore need to wait for global or GDS memory operations
1454     // to complete to ensure they are visible to waves in the other CUs.
1455     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1456     // the same CU, so no need to wait for global memory as all waves in the
1457     // work-group access the same L1, nor wait for GDS as accesses are ordered
1458     // on a CU.
1459     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1460                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1461         (Scope == SIAtomicScope::WORKGROUP)) {
1462       // Same as GFX7 using agent scope.
1463       Scope = SIAtomicScope::AGENT;
1464     }
1465     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1466     // LDS memory operations.
1467     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1468   }
1469   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1470                                         IsCrossAddrSpaceOrdering, Pos);
1471 }
1472 
1473 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1474                                          SIAtomicScope Scope,
1475                                          SIAtomicAddrSpace AddrSpace,
1476                                          Position Pos) const {
1477   if (!InsertCacheInv)
1478     return false;
1479 
1480   bool Changed = false;
1481 
1482   MachineBasicBlock &MBB = *MI->getParent();
1483   DebugLoc DL = MI->getDebugLoc();
1484 
1485   if (Pos == Position::AFTER)
1486     ++MI;
1487 
1488   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1489     switch (Scope) {
1490     case SIAtomicScope::SYSTEM:
1491       // Ensures that following loads will not see stale remote VMEM data or
1492       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1493       // CC will never be stale due to the local memory probes.
1494       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1495       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1496       // hardware does not reorder memory operations by the same wave with
1497       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1498       // remove any cache lines of earlier writes by the same wave and ensures
1499       // later reads by the same wave will refetch the cache lines.
1500       Changed = true;
1501       break;
1502     case SIAtomicScope::AGENT:
1503       // Same as GFX7.
1504       break;
1505     case SIAtomicScope::WORKGROUP:
1506       // In threadgroup split mode the waves of a work-group can be executing on
1507       // different CUs. Therefore need to invalidate the L1 which is per CU.
1508       // Otherwise in non-threadgroup split mode all waves of a work-group are
1509       // on the same CU, and so the L1 does not need to be invalidated.
1510       if (ST.isTgSplitEnabled()) {
1511         // Same as GFX7 using agent scope.
1512         Scope = SIAtomicScope::AGENT;
1513       }
1514       break;
1515     case SIAtomicScope::WAVEFRONT:
1516     case SIAtomicScope::SINGLETHREAD:
1517       // Same as GFX7.
1518       break;
1519     default:
1520       llvm_unreachable("Unsupported synchronization scope");
1521     }
1522   }
1523 
1524   /// The scratch address space does not need the global memory cache
1525   /// to be flushed as all memory operations by the same thread are
1526   /// sequentially consistent, and no other thread can access scratch
1527   /// memory.
1528 
1529   /// Other address spaces do not have a cache.
1530 
1531   if (Pos == Position::AFTER)
1532     --MI;
1533 
1534   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1535 
1536   return Changed;
1537 }
1538 
1539 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1540                                          SIAtomicScope Scope,
1541                                          SIAtomicAddrSpace AddrSpace,
1542                                          bool IsCrossAddrSpaceOrdering,
1543                                          Position Pos) const {
1544   bool Changed = false;
1545 
1546   MachineBasicBlock &MBB = *MI->getParent();
1547   const DebugLoc &DL = MI->getDebugLoc();
1548 
1549   if (Pos == Position::AFTER)
1550     ++MI;
1551 
1552   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1553     switch (Scope) {
1554     case SIAtomicScope::SYSTEM:
1555       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1556       // hardware does not reorder memory operations by the same wave with
1557       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1558       // to initiate writeback of any dirty cache lines of earlier writes by the
1559       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1560       // writeback has completed.
1561       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1562         // Set SC bits to indicate system scope.
1563         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1564       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1565       // vmcnt(0)" needed by the "BUFFER_WBL2".
1566       Changed = true;
1567       break;
1568     case SIAtomicScope::AGENT:
1569     case SIAtomicScope::WORKGROUP:
1570     case SIAtomicScope::WAVEFRONT:
1571     case SIAtomicScope::SINGLETHREAD:
1572       // Same as GFX7.
1573       break;
1574     default:
1575       llvm_unreachable("Unsupported synchronization scope");
1576     }
1577   }
1578 
1579   if (Pos == Position::AFTER)
1580     --MI;
1581 
1582   Changed |=
1583       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1584                                         IsCrossAddrSpaceOrdering, Pos);
1585 
1586   return Changed;
1587 }
1588 
1589 bool SIGfx940CacheControl::enableLoadCacheBypass(
1590     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1591     SIAtomicAddrSpace AddrSpace) const {
1592   assert(MI->mayLoad() && !MI->mayStore());
1593   bool Changed = false;
1594 
1595   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1596     switch (Scope) {
1597     case SIAtomicScope::SYSTEM:
1598       // Set SC bits to indicate system scope.
1599       Changed |= enableSC0Bit(MI);
1600       Changed |= enableSC1Bit(MI);
1601       break;
1602     case SIAtomicScope::AGENT:
1603       // Set SC bits to indicate agent scope.
1604       Changed |= enableSC1Bit(MI);
1605       break;
1606     case SIAtomicScope::WORKGROUP:
1607       // In threadgroup split mode the waves of a work-group can be executing on
1608       // different CUs. Therefore need to bypass the L1 which is per CU.
1609       // Otherwise in non-threadgroup split mode all waves of a work-group are
1610       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1611       // bits to indicate work-group scope will do this automatically.
1612       Changed |= enableSC0Bit(MI);
1613       break;
1614     case SIAtomicScope::WAVEFRONT:
1615     case SIAtomicScope::SINGLETHREAD:
1616       // Leave SC bits unset to indicate wavefront scope.
1617       break;
1618     default:
1619       llvm_unreachable("Unsupported synchronization scope");
1620     }
1621   }
1622 
1623   /// The scratch address space does not need the global memory caches
1624   /// to be bypassed as all memory operations by the same thread are
1625   /// sequentially consistent, and no other thread can access scratch
1626   /// memory.
1627 
1628   /// Other address spaces do not have a cache.
1629 
1630   return Changed;
1631 }
1632 
1633 bool SIGfx940CacheControl::enableStoreCacheBypass(
1634     const MachineBasicBlock::iterator &MI,
1635     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1636   assert(!MI->mayLoad() && MI->mayStore());
1637   bool Changed = false;
1638 
1639   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1640     switch (Scope) {
1641     case SIAtomicScope::SYSTEM:
1642       // Set SC bits to indicate system scope.
1643       Changed |= enableSC0Bit(MI);
1644       Changed |= enableSC1Bit(MI);
1645       break;
1646     case SIAtomicScope::AGENT:
1647       // Set SC bits to indicate agent scope.
1648       Changed |= enableSC1Bit(MI);
1649       break;
1650     case SIAtomicScope::WORKGROUP:
1651       // Set SC bits to indicate workgroup scope.
1652       Changed |= enableSC0Bit(MI);
1653       break;
1654     case SIAtomicScope::WAVEFRONT:
1655     case SIAtomicScope::SINGLETHREAD:
1656       // Leave SC bits unset to indicate wavefront scope.
1657       break;
1658     default:
1659       llvm_unreachable("Unsupported synchronization scope");
1660     }
1661   }
1662 
1663   /// The scratch address space does not need the global memory caches
1664   /// to be bypassed as all memory operations by the same thread are
1665   /// sequentially consistent, and no other thread can access scratch
1666   /// memory.
1667 
1668   /// Other address spaces do not have a cache.
1669 
1670   return Changed;
1671 }
1672 
1673 bool SIGfx940CacheControl::enableRMWCacheBypass(
1674     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1675     SIAtomicAddrSpace AddrSpace) const {
1676   assert(MI->mayLoad() && MI->mayStore());
1677   bool Changed = false;
1678 
1679   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1680     switch (Scope) {
1681     case SIAtomicScope::SYSTEM:
1682       // Set SC1 bit to indicate system scope.
1683       Changed |= enableSC1Bit(MI);
1684       break;
1685     case SIAtomicScope::AGENT:
1686     case SIAtomicScope::WORKGROUP:
1687     case SIAtomicScope::WAVEFRONT:
1688     case SIAtomicScope::SINGLETHREAD:
1689       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1690       // to indicate system or agent scope. The SC0 bit is used to indicate if
1691       // they are return or no-return. Leave SC1 bit unset to indicate agent
1692       // scope.
1693       break;
1694     default:
1695       llvm_unreachable("Unsupported synchronization scope");
1696     }
1697   }
1698 
1699   return Changed;
1700 }
1701 
1702 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1703     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1704     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1705   // Only handle load and store, not atomic read-modify-write instructions. The
1706   // latter use glc to indicate if the atomic returns a result, so glc must
1707   // not be used for cache control.
1708   assert(MI->mayLoad() ^ MI->mayStore());
1709 
1710   // Only update load and store, not LLVM IR atomic read-modify-write
1711   // instructions. The latter are always marked as volatile, so they cannot
1712   // sensibly be handled here without pessimizing all atomics. They also do not
1713   // support the nontemporal attribute.
1714   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1715 
1716   bool Changed = false;
1717 
1718   if (IsVolatile) {
1719     // Set SC bits to indicate system scope.
1720     Changed |= enableSC0Bit(MI);
1721     Changed |= enableSC1Bit(MI);
1722 
1723     // Ensure operation has completed at system scope to cause all volatile
1724     // operations to be visible outside the program in a global order. Do not
1725     // request cross address space as only the global address space can be
1726     // observable outside the program, so no need to cause a waitcnt for LDS
1727     // address space operations.
1728     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1729                           Position::AFTER);
1730 
1731     return Changed;
1732   }
1733 
1734   if (IsNonTemporal) {
1735     Changed |= enableNTBit(MI);
1736     return Changed;
1737   }
1738 
1739   return Changed;
1740 }
1741 
1742 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1743                                          SIAtomicScope Scope,
1744                                          SIAtomicAddrSpace AddrSpace,
1745                                          Position Pos) const {
1746   if (!InsertCacheInv)
1747     return false;
1748 
1749   bool Changed = false;
1750 
1751   MachineBasicBlock &MBB = *MI->getParent();
1752   DebugLoc DL = MI->getDebugLoc();
1753 
1754   if (Pos == Position::AFTER)
1755     ++MI;
1756 
1757   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1758     switch (Scope) {
1759     case SIAtomicScope::SYSTEM:
1760       // Ensures that following loads will not see stale remote VMEM data or
1761       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1762       // CC will never be stale due to the local memory probes.
1763       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1764           // Set SC bits to indicate system scope.
1765           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1766       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1767       // hardware does not reorder memory operations by the same wave with
1768       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1769       // remove any cache lines of earlier writes by the same wave and ensures
1770       // later reads by the same wave will refetch the cache lines.
1771       Changed = true;
1772       break;
1773     case SIAtomicScope::AGENT:
1774       // Ensures that following loads will not see stale remote data or local
1775       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1776       // due to the memory probes.
1777       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1778           // Set SC bits to indicate agent scope.
1779           .addImm(AMDGPU::CPol::SC1);
1780       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1781       // does not reorder memory operations with respect to the preceding buffer
1782       // invalidate. The invalidate is guaranteed to remove any cache lines of
1783       // earlier writes and ensures later reads will refetch the cache lines.
1784       Changed = true;
1785       break;
1786     case SIAtomicScope::WORKGROUP:
1787       // In threadgroup split mode the waves of a work-group can be executing on
1788       // different CUs. Therefore need to invalidate the L1 which is per CU.
1789       // Otherwise in non-threadgroup split mode all waves of a work-group are
1790       // on the same CU, and so the L1 does not need to be invalidated.
1791       if (ST.isTgSplitEnabled()) {
1792         // Ensures L1 is invalidated if in threadgroup split mode. In
1793         // non-threadgroup split mode it is a NOP, but there is no point in
1794         // generating it when we know we are not in that mode.
1795         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1796             // Set SC bits to indicate work-group scope.
1797             .addImm(AMDGPU::CPol::SC0);
1798         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1799         // does not reorder memory operations with respect to the preceding buffer
1800         // invalidate. The invalidate is guaranteed to remove any cache lines of
1801         // earlier writes and ensures later reads will refetch the cache lines.
1802         Changed = true;
1803       }
1804       break;
1805     case SIAtomicScope::WAVEFRONT:
1806     case SIAtomicScope::SINGLETHREAD:
1807       // Could generate "BUFFER_INV" but it would do nothing as there are no
1808       // caches to invalidate.
1809       break;
1810     default:
1811       llvm_unreachable("Unsupported synchronization scope");
1812     }
1813   }
1814 
1815   /// The scratch address space does not need the global memory cache
1816   /// to be flushed as all memory operations by the same thread are
1817   /// sequentially consistent, and no other thread can access scratch
1818   /// memory.
1819 
1820   /// Other address spaces do not have a cache.
1821 
1822   if (Pos == Position::AFTER)
1823     --MI;
1824 
1825   return Changed;
1826 }
1827 
1828 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1829                                          SIAtomicScope Scope,
1830                                          SIAtomicAddrSpace AddrSpace,
1831                                          bool IsCrossAddrSpaceOrdering,
1832                                          Position Pos) const {
1833   bool Changed = false;
1834 
1835   MachineBasicBlock &MBB = *MI->getParent();
1836   DebugLoc DL = MI->getDebugLoc();
1837 
1838   if (Pos == Position::AFTER)
1839     ++MI;
1840 
1841   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1842     switch (Scope) {
1843     case SIAtomicScope::SYSTEM:
1844       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1845       // hardware does not reorder memory operations by the same wave with
1846       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1847       // to initiate writeback of any dirty cache lines of earlier writes by the
1848       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1849       // writeback has completed.
1850       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1851           // Set SC bits to indicate system scope.
1852           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1853       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1854       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1855       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1856       Changed = true;
1857       break;
1858     case SIAtomicScope::AGENT:
1859       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1860           // Set SC bits to indicate agent scope.
1861           .addImm(AMDGPU::CPol::SC1);
1862 
1863       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1864       // SIAtomicScope::AGENT, the following insertWait will generate the
1865       // required "S_WAITCNT vmcnt(0)".
1866       Changed = true;
1867       break;
1868     case SIAtomicScope::WORKGROUP:
1869     case SIAtomicScope::WAVEFRONT:
1870     case SIAtomicScope::SINGLETHREAD:
1871       // Do not generate "BUFFER_WBL2" as there are no caches it would write
1872       // back, and it would require an otherwise unnecessary
1873       // "S_WAITCNT vmcnt(0)".
1874       break;
1875     default:
1876       llvm_unreachable("Unsupported synchronization scope");
1877     }
1878   }
1879 
1880   if (Pos == Position::AFTER)
1881     --MI;
1882 
1883   // Insert the S_WAITCNT needed by any "BUFFER_WBL2", as well as any other
1884   // required S_WAITCNT.
1885   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1886                         IsCrossAddrSpaceOrdering, Pos);
1887 
1888   return Changed;
1889 }
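
// For illustration only (approximate mnemonics, not verbatim compiler output):
// a system-scope release handled by insertRelease above is expected to expand
// to roughly
//   buffer_wbl2 sc0 sc1   ; initiate writeback of dirty cache lines
//   s_waitcnt vmcnt(0)    ; emitted via the trailing insertWait call
// so the release only completes once the L2 writeback has finished.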
1890 
1891 bool SIGfx10CacheControl::enableLoadCacheBypass(
1892     const MachineBasicBlock::iterator &MI,
1893     SIAtomicScope Scope,
1894     SIAtomicAddrSpace AddrSpace) const {
1895   assert(MI->mayLoad() && !MI->mayStore());
1896   bool Changed = false;
1897 
1898   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1899     switch (Scope) {
1900     case SIAtomicScope::SYSTEM:
1901     case SIAtomicScope::AGENT:
1902       // Set the L0 and L1 cache policies to MISS_EVICT.
1903       // Note: there is no L2 cache coherent bypass control at the ISA level.
1904       Changed |= enableGLCBit(MI);
1905       Changed |= enableDLCBit(MI);
1906       break;
1907     case SIAtomicScope::WORKGROUP:
1908       // In WGP mode the waves of a work-group can be executing on either CU of
1909       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1910       // CU mode all waves of a work-group are on the same CU, and so the L0
1911       // does not need to be bypassed.
1912       if (!ST.isCuModeEnabled())
1913         Changed |= enableGLCBit(MI);
1914       break;
1915     case SIAtomicScope::WAVEFRONT:
1916     case SIAtomicScope::SINGLETHREAD:
1917       // No cache to bypass.
1918       break;
1919     default:
1920       llvm_unreachable("Unsupported synchronization scope");
1921     }
1922   }
1923 
1924   /// The scratch address space does not need the global memory caches
1925   /// to be bypassed as all memory operations by the same thread are
1926   /// sequentially consistent, and no other thread can access scratch
1927   /// memory.
1928 
1929   /// Other address spaces do not have a cache.
1930 
1931   return Changed;
1932 }
1933 
1934 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1935     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1936     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1937 
1938   // Only handle load and store, not atomic read-modify-write instructions. The
1939   // latter use glc to indicate if the atomic returns a result, so glc must
1940   // not be used for cache control.
1941   assert(MI->mayLoad() ^ MI->mayStore());
1942 
1943   // Only update load and store, not LLVM IR atomic read-modify-write
1944   // instructions. The latter are always marked as volatile, so they cannot
1945   // sensibly be handled here without pessimizing all atomics. They also do not
1946   // support the nontemporal attribute.
1947   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1948 
1949   bool Changed = false;
1950 
1951   if (IsVolatile) {
1952     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1953     // and MISS_LRU for store instructions.
1954     // Note: there is no L2 cache coherent bypass control at the ISA level.
1955     if (Op == SIMemOp::LOAD) {
1956       Changed |= enableGLCBit(MI);
1957       Changed |= enableDLCBit(MI);
1958     }
1959 
1960     // Ensure operation has completed at system scope to cause all volatile
1961     // operations to be visible outside the program in a global order. Do not
1962     // request cross address space as only the global address space can be
1963     // observable outside the program, so no need to cause a waitcnt for LDS
1964     // address space operations.
1965     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1966                           Position::AFTER);
1967     return Changed;
1968   }
1969 
1970   if (IsNonTemporal) {
1971     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1972     // and L2 cache policy to STREAM.
1973     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1974     // to MISS_EVICT and the L2 cache policy to STREAM.
1975     if (Op == SIMemOp::STORE)
1976       Changed |= enableGLCBit(MI);
1977     Changed |= enableSLCBit(MI);
1978 
1979     return Changed;
1980   }
1981 
1982   return Changed;
1983 }
1984 
1985 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1986                                      SIAtomicScope Scope,
1987                                      SIAtomicAddrSpace AddrSpace,
1988                                      SIMemOp Op,
1989                                      bool IsCrossAddrSpaceOrdering,
1990                                      Position Pos) const {
1991   bool Changed = false;
1992 
1993   MachineBasicBlock &MBB = *MI->getParent();
1994   DebugLoc DL = MI->getDebugLoc();
1995 
1996   if (Pos == Position::AFTER)
1997     ++MI;
1998 
1999   bool VMCnt = false;
2000   bool VSCnt = false;
2001   bool LGKMCnt = false;
2002 
2003   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2004       SIAtomicAddrSpace::NONE) {
2005     switch (Scope) {
2006     case SIAtomicScope::SYSTEM:
2007     case SIAtomicScope::AGENT:
2008       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2009         VMCnt |= true;
2010       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2011         VSCnt |= true;
2012       break;
2013     case SIAtomicScope::WORKGROUP:
2014       // In WGP mode the waves of a work-group can be executing on either CU of
2015       // the WGP. Therefore need to wait for operations to complete to ensure
2016       // they are visible to waves in the other CU as the L0 is per CU.
2017       // Otherwise in CU mode all waves of a work-group are on the same CU
2018       // which shares the same L0.
2019       if (!ST.isCuModeEnabled()) {
2020         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2021           VMCnt |= true;
2022         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2023           VSCnt |= true;
2024       }
2025       break;
2026     case SIAtomicScope::WAVEFRONT:
2027     case SIAtomicScope::SINGLETHREAD:
2028       // The L0 cache keeps all memory operations in order for
2029       // work-items in the same wavefront.
2030       break;
2031     default:
2032       llvm_unreachable("Unsupported synchronization scope");
2033     }
2034   }
2035 
2036   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2037     switch (Scope) {
2038     case SIAtomicScope::SYSTEM:
2039     case SIAtomicScope::AGENT:
2040     case SIAtomicScope::WORKGROUP:
2041       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2042       // not needed as LDS operations for all waves are executed in a total
2043       // global ordering as observed by all waves. Required if also
2044       // synchronizing with global/GDS memory as LDS operations could be
2045       // reordered with respect to later global/GDS memory operations of the
2046       // same wave.
2047       LGKMCnt |= IsCrossAddrSpaceOrdering;
2048       break;
2049     case SIAtomicScope::WAVEFRONT:
2050     case SIAtomicScope::SINGLETHREAD:
2051       // The LDS keeps all memory operations in order for
2052       // the same wavefront.
2053       break;
2054     default:
2055       llvm_unreachable("Unsupported synchronization scope");
2056     }
2057   }
2058 
2059   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2060     switch (Scope) {
2061     case SIAtomicScope::SYSTEM:
2062     case SIAtomicScope::AGENT:
2063       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2064       // is not needed as GDS operations for all waves are executed in a total
2065       // global ordering as observed by all waves. Required if also
2066       // synchronizing with global/LDS memory as GDS operations could be
2067       // reordered with respect to later global/LDS memory operations of the
2068       // same wave.
2069       LGKMCnt |= IsCrossAddrSpaceOrdering;
2070       break;
2071     case SIAtomicScope::WORKGROUP:
2072     case SIAtomicScope::WAVEFRONT:
2073     case SIAtomicScope::SINGLETHREAD:
2074       // The GDS keeps all memory operations in order for
2075       // the same work-group.
2076       break;
2077     default:
2078       llvm_unreachable("Unsupported synchronization scope");
2079     }
2080   }
2081 
2082   if (VMCnt || LGKMCnt) {
2083     unsigned WaitCntImmediate =
2084       AMDGPU::encodeWaitcnt(IV,
2085                             VMCnt ? 0 : getVmcntBitMask(IV),
2086                             getExpcntBitMask(IV),
2087                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2088     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2089         .addImm(WaitCntImmediate);
2090     Changed = true;
2091   }
2092 
2093   if (VSCnt) {
2094     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2095         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2096         .addImm(0);
2097     Changed = true;
2098   }
2099 
2100   if (Pos == Position::AFTER)
2101     --MI;
2102 
2103   return Changed;
2104 }
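
// For illustration only: with, e.g., VMCnt set and LGKMCnt clear, the encoding
// above leaves the untouched counters at their full bit masks, equivalent to
//   unsigned Imm = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0, getExpcntBitMask(IV),
//                                        getLgkmcntBitMask(IV));
// i.e. an "S_WAITCNT vmcnt(0)" that places no constraint on expcnt or lgkmcnt.
// The separate S_WAITCNT_VSCNT_soft handles the store counter, which gfx10 does
// not fold into the combined waitcnt immediate.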
2105 
2106 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2107                                         SIAtomicScope Scope,
2108                                         SIAtomicAddrSpace AddrSpace,
2109                                         Position Pos) const {
2110   if (!InsertCacheInv)
2111     return false;
2112 
2113   bool Changed = false;
2114 
2115   MachineBasicBlock &MBB = *MI->getParent();
2116   DebugLoc DL = MI->getDebugLoc();
2117 
2118   if (Pos == Position::AFTER)
2119     ++MI;
2120 
2121   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2122     switch (Scope) {
2123     case SIAtomicScope::SYSTEM:
2124     case SIAtomicScope::AGENT:
2125       // The order of invalidates matters here. We must invalidate "outer in"
2126       // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2127       // invalidated.
2128       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2129       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2130       Changed = true;
2131       break;
2132     case SIAtomicScope::WORKGROUP:
2133       // In WGP mode the waves of a work-group can be executing on either CU of
2134       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2135       // in CU mode all waves of a work-group are on the same CU, and so the
2136       // L0 does not need to be invalidated.
2137       if (!ST.isCuModeEnabled()) {
2138         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2139         Changed = true;
2140       }
2141       break;
2142     case SIAtomicScope::WAVEFRONT:
2143     case SIAtomicScope::SINGLETHREAD:
2144       // No cache to invalidate.
2145       break;
2146     default:
2147       llvm_unreachable("Unsupported synchronization scope");
2148     }
2149   }
2150 
2151   /// The scratch address space does not need the global memory cache
2152   /// to be flushed as all memory operations by the same thread are
2153   /// sequentially consistent, and no other thread can access scratch
2154   /// memory.
2155 
2156   /// Other address spaces do not have a cache.
2157 
2158   if (Pos == Position::AFTER)
2159     --MI;
2160 
2161   return Changed;
2162 }
2163 
2164 bool SIGfx11CacheControl::enableLoadCacheBypass(
2165     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2166     SIAtomicAddrSpace AddrSpace) const {
2167   assert(MI->mayLoad() && !MI->mayStore());
2168   bool Changed = false;
2169 
2170   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2171     switch (Scope) {
2172     case SIAtomicScope::SYSTEM:
2173     case SIAtomicScope::AGENT:
2174       // Set the L0 and L1 cache policies to MISS_EVICT.
2175       // Note: there is no L2 cache coherent bypass control at the ISA level.
2176       Changed |= enableGLCBit(MI);
2177       break;
2178     case SIAtomicScope::WORKGROUP:
2179       // In WGP mode the waves of a work-group can be executing on either CU of
2180       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2181       // CU mode all waves of a work-group are on the same CU, and so the L0
2182       // does not need to be bypassed.
2183       if (!ST.isCuModeEnabled())
2184         Changed |= enableGLCBit(MI);
2185       break;
2186     case SIAtomicScope::WAVEFRONT:
2187     case SIAtomicScope::SINGLETHREAD:
2188       // No cache to bypass.
2189       break;
2190     default:
2191       llvm_unreachable("Unsupported synchronization scope");
2192     }
2193   }
2194 
2195   /// The scratch address space does not need the global memory caches
2196   /// to be bypassed as all memory operations by the same thread are
2197   /// sequentially consistent, and no other thread can access scratch
2198   /// memory.
2199 
2200   /// Other address spaces do not have a cache.
2201 
2202   return Changed;
2203 }
2204 
2205 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2206     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2207     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2208 
2209   // Only handle load and store, not atomic read-modify-write instructions. The
2210   // latter use glc to indicate if the atomic returns a result, so glc must
2211   // not be used for cache control.
2212   assert(MI->mayLoad() ^ MI->mayStore());
2213 
2214   // Only update load and store, not LLVM IR atomic read-modify-write
2215   // instructions. The latter are always marked as volatile, so they cannot
2216   // sensibly be handled here without pessimizing all atomics. They also do not
2217   // support the nontemporal attribute.
2218   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2219 
2220   bool Changed = false;
2221 
2222   if (IsVolatile) {
2223     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2224     // and MISS_LRU for store instructions.
2225     // Note: there is no L2 cache coherent bypass control at the ISA level.
2226     if (Op == SIMemOp::LOAD)
2227       Changed |= enableGLCBit(MI);
2228 
2229     // Set MALL NOALLOC for load and store instructions.
2230     Changed |= enableDLCBit(MI);
2231 
2232     // Ensure operation has completed at system scope to cause all volatile
2233     // operations to be visible outside the program in a global order. Do not
2234     // request cross address space as only the global address space can be
2235     // observable outside the program, so no need to cause a waitcnt for LDS
2236     // address space operations.
2237     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2238                           Position::AFTER);
2239     return Changed;
2240   }
2241 
2242   if (IsNonTemporal) {
2243     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2244     // and L2 cache policy to STREAM.
2245     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2246     // to MISS_EVICT and the L2 cache policy to STREAM.
2247     if (Op == SIMemOp::STORE)
2248       Changed |= enableGLCBit(MI);
2249     Changed |= enableSLCBit(MI);
2250 
2251     // Set MALL NOALLOC for load and store instructions.
2252     Changed |= enableDLCBit(MI);
2253     return Changed;
2254   }
2255 
2256   return Changed;
2257 }
2258 
2259 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2260                                 AMDGPU::CPol::CPol Value) const {
2261   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2262   if (!CPol)
2263     return false;
2264 
2265   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2266   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2267     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2268     return true;
2269   }
2270 
2271   return false;
2272 }
2273 
2274 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2275                                    AMDGPU::CPol::CPol Value) const {
2276   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2277   if (!CPol)
2278     return false;
2279 
2280   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2281   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2282     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2283     return true;
2284   }
2285 
2286   return false;
2287 }
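
// For illustration only (hypothetical starting value): both setTH and setScope
// rewrite a single bit-field of the cpol immediate and leave the rest intact.
// For example, applying setTH(MI, AMDGPU::CPol::TH_NT) to an operand that
// currently holds SCOPE_DEV performs, in effect,
//   uint64_t Old = CPol->getImm();                  // ... | SCOPE_DEV
//   uint64_t New = (Old & ~AMDGPU::CPol::TH) |
//                  (AMDGPU::CPol::TH_NT & AMDGPU::CPol::TH);
// so the SCOPE field (and any other policy bits) is preserved.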
2288 
2289 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2290     const MachineBasicBlock::iterator MI) const {
2291   // TODO: implement flag for frontend to give us a hint not to insert waits.
2292 
2293   MachineBasicBlock &MBB = *MI->getParent();
2294   const DebugLoc &DL = MI->getDebugLoc();
2295 
2296   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2297   BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2298   BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2299   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2300   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2301 
2302   return true;
2303 }
2304 
2305 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2306                                      SIAtomicScope Scope,
2307                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2308                                      bool IsCrossAddrSpaceOrdering,
2309                                      Position Pos) const {
2310   bool Changed = false;
2311 
2312   MachineBasicBlock &MBB = *MI->getParent();
2313   DebugLoc DL = MI->getDebugLoc();
2314 
2315   bool LOADCnt = false;
2316   bool DSCnt = false;
2317   bool STORECnt = false;
2318 
2319   if (Pos == Position::AFTER)
2320     ++MI;
2321 
2322   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2323       SIAtomicAddrSpace::NONE) {
2324     switch (Scope) {
2325     case SIAtomicScope::SYSTEM:
2326     case SIAtomicScope::AGENT:
2327       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2328         LOADCnt |= true;
2329       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2330         STORECnt |= true;
2331       break;
2332     case SIAtomicScope::WORKGROUP:
2333       // In WGP mode the waves of a work-group can be executing on either CU of
2334       // the WGP. Therefore need to wait for operations to complete to ensure
2335       // they are visible to waves in the other CU as the L0 is per CU.
2336       // Otherwise in CU mode all waves of a work-group are on the same CU
2337       // which shares the same L0.
2338       if (!ST.isCuModeEnabled()) {
2339         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2340           LOADCnt |= true;
2341         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2342           STORECnt |= true;
2343       }
2344       break;
2345     case SIAtomicScope::WAVEFRONT:
2346     case SIAtomicScope::SINGLETHREAD:
2347       // The L0 cache keeps all memory operations in order for
2348       // work-items in the same wavefront.
2349       break;
2350     default:
2351       llvm_unreachable("Unsupported synchronization scope");
2352     }
2353   }
2354 
2355   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2356     switch (Scope) {
2357     case SIAtomicScope::SYSTEM:
2358     case SIAtomicScope::AGENT:
2359     case SIAtomicScope::WORKGROUP:
2360       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2361       // not needed as LDS operations for all waves are executed in a total
2362       // global ordering as observed by all waves. Required if also
2363       // synchronizing with global/GDS memory as LDS operations could be
2364       // reordered with respect to later global/GDS memory operations of the
2365       // same wave.
2366       DSCnt |= IsCrossAddrSpaceOrdering;
2367       break;
2368     case SIAtomicScope::WAVEFRONT:
2369     case SIAtomicScope::SINGLETHREAD:
2370       // The LDS keeps all memory operations in order for
2371       // the same wavefront.
2372       break;
2373     default:
2374       llvm_unreachable("Unsupported synchronization scope");
2375     }
2376   }
2377 
2378   if (LOADCnt) {
2379     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2380     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2381     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2382     Changed = true;
2383   }
2384 
2385   if (STORECnt) {
2386     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2387     Changed = true;
2388   }
2389 
2390   if (DSCnt) {
2391     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2392     Changed = true;
2393   }
2394 
2395   if (Pos == Position::AFTER)
2396     --MI;
2397 
2398   return Changed;
2399 }
2400 
2401 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2402                                         SIAtomicScope Scope,
2403                                         SIAtomicAddrSpace AddrSpace,
2404                                         Position Pos) const {
2405   if (!InsertCacheInv)
2406     return false;
2407 
2408   MachineBasicBlock &MBB = *MI->getParent();
2409   DebugLoc DL = MI->getDebugLoc();
2410 
2411   /// The scratch address space does not need the global memory cache
2412   /// to be flushed as all memory operations by the same thread are
2413   /// sequentially consistent, and no other thread can access scratch
2414   /// memory.
2415 
2416   /// Other address spaces do not have a cache.
2417   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2418     return false;
2419 
2420   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2421   switch (Scope) {
2422   case SIAtomicScope::SYSTEM:
2423     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2424     break;
2425   case SIAtomicScope::AGENT:
2426     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2427     break;
2428   case SIAtomicScope::WORKGROUP:
2429     // In WGP mode the waves of a work-group can be executing on either CU of
2430     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2431     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2432     // the L0 does not need to be invalidated.
2433     if (ST.isCuModeEnabled())
2434       return false;
2435 
2436     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2437     break;
2438   case SIAtomicScope::WAVEFRONT:
2439   case SIAtomicScope::SINGLETHREAD:
2440     // No cache to invalidate.
2441     return false;
2442   default:
2443     llvm_unreachable("Unsupported synchronization scope");
2444   }
2445 
2446   if (Pos == Position::AFTER)
2447     ++MI;
2448 
2449   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2450 
2451   if (Pos == Position::AFTER)
2452     --MI;
2453 
2454   return true;
2455 }
2456 
2457 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2458                                         SIAtomicScope Scope,
2459                                         SIAtomicAddrSpace AddrSpace,
2460                                         bool IsCrossAddrSpaceOrdering,
2461                                         Position Pos) const {
2462   MachineBasicBlock &MBB = *MI->getParent();
2463   DebugLoc DL = MI->getDebugLoc();
2464 
2465   // The scratch address space does not need the global memory cache
2466   // writeback as all memory operations by the same thread are
2467   // sequentially consistent, and no other thread can access scratch
2468   // memory.
2469 
2470   // Other address spaces do not have a cache.
2471   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2472     return false;
2473 
2474   if (Pos == Position::AFTER)
2475     ++MI;
2476 
2477   // GLOBAL_WB is always needed, even for write-through caches, as it
2478   // additionally ensures all operations have reached the desired cache level.
2479   bool SkipWB = false;
2480   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2481   switch (Scope) {
2482   case SIAtomicScope::SYSTEM:
2483     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2484     break;
2485   case SIAtomicScope::AGENT:
2486     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2487     break;
2488   case SIAtomicScope::WORKGROUP:
2489     // In WGP mode the waves of a work-group can be executing on either CU of
2490     // the WGP. Therefore we need to ensure all operations have reached L1,
2491     // hence the SCOPE_SE WB.
2492     // For CU mode, we need operations to reach L0, so the wait is enough;
2493     // there is no way for an operation to report completion without reaching
2494     // at least L0.
2495     if (ST.isCuModeEnabled())
2496       SkipWB = true;
2497     else
2498       ScopeImm = AMDGPU::CPol::SCOPE_SE;
2499     break;
2500   case SIAtomicScope::WAVEFRONT:
2501   case SIAtomicScope::SINGLETHREAD:
2502     // No cache to invalidate.
2503     return false;
2504   default:
2505     llvm_unreachable("Unsupported synchronization scope");
2506   }
2507 
2508   if (!SkipWB)
2509     BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);
2510 
2511   if (Pos == Position::AFTER)
2512     --MI;
2513 
2514   // We always have to wait for previous memory operations (load/store) to
2515   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2516   // we of course need to wait for that as well.
2517   insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2518              IsCrossAddrSpaceOrdering, Pos);
2519 
2520   return true;
2521 }
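
// For illustration only (approximate mnemonics): an agent-scope release handled
// by insertRelease above is expected to expand to roughly
//   global_wb scope:SCOPE_DEV
//   s_wait_bvhcnt 0x0
//   s_wait_samplecnt 0x0
//   s_wait_loadcnt 0x0
//   s_wait_storecnt 0x0
// where the waits come from the insertWait call and the soft forms may later be
// refined by the waitcnt insertion pass.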
2522 
2523 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2524     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2525     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2526 
2527   // Only handle load and store, not atomic read-modify-write instructions.
2528   assert(MI->mayLoad() ^ MI->mayStore());
2529 
2530   // Only update load and store, not LLVM IR atomic read-modify-write
2531   // instructions. The latter are always marked as volatile, so they cannot
2532   // sensibly be handled here without pessimizing all atomics. They also do not
2533   // support the nontemporal attribute.
2534   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2535 
2536   bool Changed = false;
2537 
2538   if (IsLastUse) {
2539     // Set last-use hint.
2540     Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2541   } else if (IsNonTemporal) {
2542     // Set non-temporal hint for all cache levels.
2543     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2544   }
2545 
2546   if (IsVolatile) {
2547     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2548 
2549     if (Op == SIMemOp::STORE)
2550       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2551 
2552     // Ensure operation has completed at system scope to cause all volatile
2553     // operations to be visible outside the program in a global order. Do not
2554     // request cross address space as only the global address space can be
2555     // observable outside the program, so no need to cause a waitcnt for LDS
2556     // address space operations.
2557     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2558                           Position::AFTER);
2559   }
2560 
2561   return Changed;
2562 }
2563 
2564 bool SIGfx12CacheControl::expandSystemScopeStore(
2565     MachineBasicBlock::iterator &MI) const {
2566   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2567   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2568     return insertWaitsBeforeSystemScopeStore(MI);
2569 
2570   return false;
2571 }
2572 
2573 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2574                                          SIAtomicScope Scope,
2575                                          SIAtomicAddrSpace AddrSpace) const {
2576   bool Changed = false;
2577 
2578   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2579     switch (Scope) {
2580     case SIAtomicScope::SYSTEM:
2581       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2582       break;
2583     case SIAtomicScope::AGENT:
2584       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2585       break;
2586     case SIAtomicScope::WORKGROUP:
2587       // In WGP mode, SCOPE_SE is needed as waves can execute on
2588       // different CUs that access different L0s.
2589       if (!ST.isCuModeEnabled())
2590         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2591       break;
2592     case SIAtomicScope::WAVEFRONT:
2593     case SIAtomicScope::SINGLETHREAD:
2594       // No cache to bypass.
2595       break;
2596     default:
2597       llvm_unreachable("Unsupported synchronization scope");
2598     }
2599   }
2600 
2601   // The scratch address space does not need the global memory caches
2602   // to be bypassed as all memory operations by the same thread are
2603   // sequentially consistent, and no other thread can access scratch
2604   // memory.
2605 
2606   // Other address spaces do not have a cache.
2607 
2608   return Changed;
2609 }
2610 
2611 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2612   if (AtomicPseudoMIs.empty())
2613     return false;
2614 
2615   for (auto &MI : AtomicPseudoMIs)
2616     MI->eraseFromParent();
2617 
2618   AtomicPseudoMIs.clear();
2619   return true;
2620 }
2621 
2622 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2623                                    MachineBasicBlock::iterator &MI) {
2624   assert(MI->mayLoad() && !MI->mayStore());
2625 
2626   bool Changed = false;
2627 
2628   if (MOI.isAtomic()) {
2629     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2630         MOI.getOrdering() == AtomicOrdering::Acquire ||
2631         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2632       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2633                                            MOI.getOrderingAddrSpace());
2634     }
2635 
2636     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2637       Changed |= CC->insertWait(MI, MOI.getScope(),
2638                                 MOI.getOrderingAddrSpace(),
2639                                 SIMemOp::LOAD | SIMemOp::STORE,
2640                                 MOI.getIsCrossAddressSpaceOrdering(),
2641                                 Position::BEFORE);
2642 
2643     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2644         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2645       Changed |= CC->insertWait(MI, MOI.getScope(),
2646                                 MOI.getInstrAddrSpace(),
2647                                 SIMemOp::LOAD,
2648                                 MOI.getIsCrossAddressSpaceOrdering(),
2649                                 Position::AFTER);
2650       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2651                                    MOI.getOrderingAddrSpace(),
2652                                    Position::AFTER);
2653     }
2654 
2655     return Changed;
2656   }
2657 
2658   // Atomic instructions already bypass caches to the scope specified by the
2659   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2660   // instructions need additional treatment.
2661   Changed |= CC->enableVolatileAndOrNonTemporal(
2662       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2663       MOI.isNonTemporal(), MOI.isLastUse());
2664 
2665   return Changed;
2666 }
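
// For illustration only (IR-level view): given a load such as
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// expandLoad above enables load cache bypass at agent scope, inserts a wait
// after the load for it to complete, and then inserts an acquire (cache
// invalidate) so later loads do not observe stale data.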
2667 
2668 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2669                                     MachineBasicBlock::iterator &MI) {
2670   assert(!MI->mayLoad() && MI->mayStore());
2671 
2672   bool Changed = false;
2673 
2674   if (MOI.isAtomic()) {
2675     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2676         MOI.getOrdering() == AtomicOrdering::Release ||
2677         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2678       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2679                                             MOI.getOrderingAddrSpace());
2680     }
2681 
2682     if (MOI.getOrdering() == AtomicOrdering::Release ||
2683         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2684       Changed |= CC->insertRelease(MI, MOI.getScope(),
2685                                    MOI.getOrderingAddrSpace(),
2686                                    MOI.getIsCrossAddressSpaceOrdering(),
2687                                    Position::BEFORE);
2688 
2689     return Changed;
2690   }
2691 
2692   // Atomic instructions already bypass caches to the scope specified by the
2693   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2694   // need additional treatment.
2695   Changed |= CC->enableVolatileAndOrNonTemporal(
2696       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2697       MOI.isNonTemporal());
2698 
2699   // GFX12 specific: the scope (the desired coherence domain in the cache
2700   // hierarchy) is an instruction field; do not confuse it with atomic scope.
2701   Changed |= CC->expandSystemScopeStore(MI);
2702   return Changed;
2703 }
2704 
2705 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2706                                           MachineBasicBlock::iterator &MI) {
2707   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2708 
2709   AtomicPseudoMIs.push_back(MI);
2710   bool Changed = false;
2711 
2712   // Refine fenced address space based on MMRAs.
2713   //
2714   // TODO: Should we support this MMRA on other atomic operations?
2715   auto OrderingAddrSpace =
2716       getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2717 
  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire)
      Changed |= CC->insertWait(
          MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
          MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Support for barriers could be added in this
      /// file. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both a release and an invalidate are needed, they could be
    // combined into a single "BUFFER_WBINV*" instruction, either by
    // reorganizing this code or by teaching the SIInsertWaitcnts pass to track
    // cache invalidate and write-back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

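/// Expand an atomic cmpxchg or read-modify-write operation: enable the
/// scope-appropriate cache bypass, insert the release sequence before the
/// instruction when the success (or failure) ordering requires release
/// semantics, and insert the wait and cache invalidation after it when acquire
/// semantics are required.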
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
    const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

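    // For acquire semantics (including an acquire failure ordering on
    // cmpxchg), wait for the atomic to complete before invalidating caches.
    // The result of a returning atomic counts as a load, so the wait is on
    // LOAD for returning forms and on STORE otherwise.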
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

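/// Walk every instruction in the function, unbundle post-RA memory bundles,
/// and expand each potentially atomic load, store, fence, or RMW according to
/// the AMDGPU memory model using the subtarget's SICacheControl.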
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

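      // Only instructions flagged as potentially atomic are relevant to the
      // memory legalizer; skip everything else.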
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

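      // Classify the instruction and expand it accordingly: load, store,
      // fence, or atomic cmpxchg/RMW. Stores additionally run through the
      // target-specific tryForceStoreSC0SC1 hook.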
      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
        Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}