1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/CodeGen/MachineBasicBlock.h"
23 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/IR/DiagnosticInfo.h"
25 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
26 #include "llvm/Support/AtomicOrdering.h"
27 #include "llvm/TargetParser/TargetParser.h"
28 
29 using namespace llvm;
30 using namespace llvm::AMDGPU;
31 
32 #define DEBUG_TYPE "si-memory-legalizer"
33 #define PASS_NAME "SI Memory Legalizer"
34 
35 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
36     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
37     cl::desc("Use this to skip inserting cache invalidating instructions."));
38 
39 namespace {
40 
41 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
42 
43 /// Memory operation flags. Can be ORed together.
44 enum class SIMemOp {
45   NONE = 0u,
46   LOAD = 1u << 0,
47   STORE = 1u << 1,
48   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
49 };
50 
51 /// Position to insert a new instruction relative to an existing
52 /// instruction.
53 enum class Position {
54   BEFORE,
55   AFTER
56 };
57 
58 /// The atomic synchronization scopes supported by the AMDGPU target.
59 enum class SIAtomicScope {
60   NONE,
61   SINGLETHREAD,
62   WAVEFRONT,
63   WORKGROUP,
64   AGENT,
65   SYSTEM
66 };
67 
68 /// The distinct address spaces supported by the AMDGPU target for
69 /// atomic memory operations. Can be ORed together.
70 enum class SIAtomicAddrSpace {
71   NONE = 0u,
72   GLOBAL = 1u << 0,
73   LDS = 1u << 1,
74   SCRATCH = 1u << 2,
75   GDS = 1u << 3,
76   OTHER = 1u << 4,
77 
78   /// The address spaces that can be accessed by a FLAT instruction.
79   FLAT = GLOBAL | LDS | SCRATCH,
80 
81   /// The address spaces that support atomic instructions.
82   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
83 
84   /// All address spaces.
85   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
86 
87   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
88 };
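// Editor's note (illustrative, not from the original source): because these
// are bitmask enums, later code tests membership with mask-and-compare, e.g.
//   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE)
// and a FLAT access decomposes into GLOBAL | LDS | SCRATCH when deciding
// which waits and cache operations it needs.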
89 
90 class SIMemOpInfo final {
91 private:
92 
93   friend class SIMemOpAccess;
94 
95   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
96   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
97   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
98   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
99   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
100   bool IsCrossAddressSpaceOrdering = false;
101   bool IsVolatile = false;
102   bool IsNonTemporal = false;
103   bool IsLastUse = false;
104 
105   SIMemOpInfo(
106       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
107       SIAtomicScope Scope = SIAtomicScope::SYSTEM,
108       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
109       SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
110       bool IsCrossAddressSpaceOrdering = true,
111       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
112       bool IsVolatile = false, bool IsNonTemporal = false,
113       bool IsLastUse = false)
114       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
115         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
116         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
117         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
118         IsLastUse(IsLastUse) {
119 
120     if (Ordering == AtomicOrdering::NotAtomic) {
121       assert(Scope == SIAtomicScope::NONE &&
122              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
123              !IsCrossAddressSpaceOrdering &&
124              FailureOrdering == AtomicOrdering::NotAtomic);
125       return;
126     }
127 
128     assert(Scope != SIAtomicScope::NONE &&
129            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE &&
131            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
132                SIAtomicAddrSpace::NONE);
133 
134     // There is also no cross address space ordering if the ordering
135     // address space is the same as the instruction address space and
136     // only contains a single address space.
137     if ((OrderingAddrSpace == InstrAddrSpace) &&
138         isPowerOf2_32(uint32_t(InstrAddrSpace)))
139       this->IsCrossAddressSpaceOrdering = false;
140 
141     // Limit the scope to the maximum supported by the instruction's address
142     // spaces.
143     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
144         SIAtomicAddrSpace::NONE) {
145       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
146     } else if ((InstrAddrSpace &
147                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
148                SIAtomicAddrSpace::NONE) {
149       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
150     } else if ((InstrAddrSpace &
151                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
152                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
153       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
154     }
155   }
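  // Worked example (editor's illustration, not in the original source): for an
  // atomic whose InstrAddrSpace is only LDS, the first test fails (LDS remains
  // after masking out SCRATCH) but the second succeeds, so a requested SYSTEM
  // scope is clamped to WORKGROUP, since LDS is only shared within a
  // work-group.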
156 
157 public:
158   /// \returns Atomic synchronization scope of the machine instruction used to
159   /// create this SIMemOpInfo.
160   SIAtomicScope getScope() const {
161     return Scope;
162   }
163 
164   /// \returns Ordering constraint of the machine instruction used to
165   /// create this SIMemOpInfo.
166   AtomicOrdering getOrdering() const {
167     return Ordering;
168   }
169 
170   /// \returns Failure ordering constraint of the machine instruction used to
171   /// create this SIMemOpInfo.
172   AtomicOrdering getFailureOrdering() const {
173     return FailureOrdering;
174   }
175 
176   /// \returns The address spaces accessed by the machine
177   /// instruction used to create this SIMemOpInfo.
178   SIAtomicAddrSpace getInstrAddrSpace() const {
179     return InstrAddrSpace;
180   }
181 
182   /// \returns The address spaces that must be ordered by the machine
183   /// instruction used to create this SIMemOpInfo.
184   SIAtomicAddrSpace getOrderingAddrSpace() const {
185     return OrderingAddrSpace;
186   }
187 
188   /// \returns True iff memory ordering of operations on
189   /// different address spaces is required.
190   bool getIsCrossAddressSpaceOrdering() const {
191     return IsCrossAddressSpaceOrdering;
192   }
193 
194   /// \returns True if memory access of the machine instruction used to
195   /// create this SIMemOpInfo is volatile, false otherwise.
196   bool isVolatile() const {
197     return IsVolatile;
198   }
199 
200   /// \returns True if memory access of the machine instruction used to
201   /// create this SIMemOpInfo is nontemporal, false otherwise.
202   bool isNonTemporal() const {
203     return IsNonTemporal;
204   }
205 
206   /// \returns True if memory access of the machine instruction used to
207   /// create this SIMemOpInfo is last use, false otherwise.
208   bool isLastUse() const { return IsLastUse; }
209 
210   /// \returns True if ordering constraint of the machine instruction used to
211   /// create this SIMemOpInfo is unordered or higher, false otherwise.
212   bool isAtomic() const {
213     return Ordering != AtomicOrdering::NotAtomic;
214   }
215 
216 };
217 
218 class SIMemOpAccess final {
219 private:
220   const AMDGPUMachineModuleInfo *MMI = nullptr;
221 
222   /// Reports unsupported message \p Msg for \p MI to LLVM context.
223   void reportUnsupported(const MachineBasicBlock::iterator &MI,
224                          const char *Msg) const;
225 
226   /// Inspects the target synchronization scope \p SSID and determines
227   /// the SI atomic scope it corresponds to, the address spaces it
228   /// covers, and whether the memory ordering applies between address
229   /// spaces.
230   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
231   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
232 
233   /// \returns A bit set of the address spaces accessed via \p AS.
234   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
235 
236   /// \returns Info constructed from \p MI, which has at least one machine
237   /// memory operand.
238   std::optional<SIMemOpInfo>
239   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
240 
241 public:
242   /// Construct class to support accessing the machine memory operands
243   /// of instructions in the machine function \p MF.
244   SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
245 
246   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
247   std::optional<SIMemOpInfo>
248   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
249 
250   /// \returns Store info if \p MI is a store operation, "std::nullopt"
251   /// otherwise.
252   std::optional<SIMemOpInfo>
253   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
254 
255   /// \returns Atomic fence info if \p MI is an atomic fence operation,
256   /// "std::nullopt" otherwise.
257   std::optional<SIMemOpInfo>
258   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
259 
260   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
261   /// rmw operation, "std::nullopt" otherwise.
262   std::optional<SIMemOpInfo>
263   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
264 };
265 
266 class SICacheControl {
267 protected:
268 
269   /// AMDGPU subtarget info.
270   const GCNSubtarget &ST;
271 
272   /// Instruction info.
273   const SIInstrInfo *TII = nullptr;
274 
275   IsaVersion IV;
276 
277   /// Whether to insert cache invalidating instructions.
278   bool InsertCacheInv;
279 
280   SICacheControl(const GCNSubtarget &ST);
281 
282   /// Sets named bit \p Bit to "true" if present in instruction \p MI.
283   /// \returns True if \p MI is modified, false otherwise.
284   bool enableNamedBit(const MachineBasicBlock::iterator MI,
285                       AMDGPU::CPol::CPol Bit) const;
286 
287 public:
288 
289   /// Create a cache control for the subtarget \p ST.
290   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
291 
292   /// Update \p MI memory load instruction to bypass any caches up to
293   /// the \p Scope memory scope for address spaces \p
294   /// AddrSpace. Return true iff the instruction was modified.
295   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
296                                      SIAtomicScope Scope,
297                                      SIAtomicAddrSpace AddrSpace) const = 0;
298 
299   /// Update \p MI memory store instruction to bypass any caches up to
300   /// the \p Scope memory scope for address spaces \p
301   /// AddrSpace. Return true iff the instruction was modified.
302   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303                                       SIAtomicScope Scope,
304                                       SIAtomicAddrSpace AddrSpace) const = 0;
305 
306   /// Update \p MI memory read-modify-write instruction to bypass any caches up
307   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308   /// iff the instruction was modified.
309   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310                                     SIAtomicScope Scope,
311                                     SIAtomicAddrSpace AddrSpace) const = 0;
312 
313   /// Update \p MI memory instruction of kind \p Op associated with address
314   /// spaces \p AddrSpace to indicate it is volatile and/or
315   /// nontemporal/last-use. Return true iff the instruction was modified.
316   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
317                                               SIAtomicAddrSpace AddrSpace,
318                                               SIMemOp Op, bool IsVolatile,
319                                               bool IsNonTemporal,
320                                               bool IsLastUse = false) const = 0;
321 
322   virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323     return false;
324   }
325 
326   /// Inserts any necessary instructions at position \p Pos relative
327   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328   /// \p Op associated with address spaces \p AddrSpace have completed. Used
329   /// between memory instructions to enforce the order they become visible as
330   /// observed by other memory instructions executing in memory scope \p Scope.
331   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332   /// address spaces. Returns true iff any instructions are inserted.
333   virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
334                           SIAtomicAddrSpace AddrSpace, SIMemOp Op,
335                           bool IsCrossAddrSpaceOrdering, Position Pos,
336                           AtomicOrdering Order) const = 0;
337 
338   /// Inserts any necessary instructions at position \p Pos relative to
339   /// instruction \p MI to ensure any subsequent memory instructions of this
340   /// thread with address spaces \p AddrSpace will observe the previous memory
341   /// operations by any thread for memory scopes up to memory scope \p Scope.
342   /// Returns true iff any instructions are inserted.
343   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
344                              SIAtomicScope Scope,
345                              SIAtomicAddrSpace AddrSpace,
346                              Position Pos) const = 0;
347 
348   /// Inserts any necessary instructions at position \p Pos relative to
349   /// instruction \p MI to ensure previous memory instructions by this thread
350   /// with address spaces \p AddrSpace have completed and can be observed by
351   /// subsequent memory instructions by any thread executing in memory scope \p
352   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
353   /// between address spaces. Returns true iff any instructions are inserted.
354   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
355                              SIAtomicScope Scope,
356                              SIAtomicAddrSpace AddrSpace,
357                              bool IsCrossAddrSpaceOrdering,
358                              Position Pos) const = 0;
359 
360   /// Virtual destructor to allow derivations to be deleted.
361   virtual ~SICacheControl() = default;
362 
363   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
364                                    MachineBasicBlock::iterator &MI) const {
365     return false;
366   }
367 };
368 
369 class SIGfx6CacheControl : public SICacheControl {
370 protected:
371 
372   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
373   /// is modified, false otherwise.
374   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
375     return enableNamedBit(MI, AMDGPU::CPol::GLC);
376   }
377 
378   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
379   /// is modified, false otherwise.
380   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
381     return enableNamedBit(MI, AMDGPU::CPol::SLC);
382   }
383 
384 public:
385 
386   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
387 
388   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
389                              SIAtomicScope Scope,
390                              SIAtomicAddrSpace AddrSpace) const override;
391 
392   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
393                               SIAtomicScope Scope,
394                               SIAtomicAddrSpace AddrSpace) const override;
395 
396   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
397                             SIAtomicScope Scope,
398                             SIAtomicAddrSpace AddrSpace) const override;
399 
400   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
401                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
402                                       bool IsVolatile, bool IsNonTemporal,
403                                       bool IsLastUse) const override;
404 
405   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
406                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
407                   bool IsCrossAddrSpaceOrdering, Position Pos,
408                   AtomicOrdering Order) const override;
409 
410   bool insertAcquire(MachineBasicBlock::iterator &MI,
411                      SIAtomicScope Scope,
412                      SIAtomicAddrSpace AddrSpace,
413                      Position Pos) const override;
414 
415   bool insertRelease(MachineBasicBlock::iterator &MI,
416                      SIAtomicScope Scope,
417                      SIAtomicAddrSpace AddrSpace,
418                      bool IsCrossAddrSpaceOrdering,
419                      Position Pos) const override;
420 };
421 
422 class SIGfx7CacheControl : public SIGfx6CacheControl {
423 public:
424 
425   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
426 
427   bool insertAcquire(MachineBasicBlock::iterator &MI,
428                      SIAtomicScope Scope,
429                      SIAtomicAddrSpace AddrSpace,
430                      Position Pos) const override;
431 
432 };
433 
434 class SIGfx90ACacheControl : public SIGfx7CacheControl {
435 public:
436 
437   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
438 
439   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
440                              SIAtomicScope Scope,
441                              SIAtomicAddrSpace AddrSpace) const override;
442 
443   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
444                               SIAtomicScope Scope,
445                               SIAtomicAddrSpace AddrSpace) const override;
446 
447   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
448                             SIAtomicScope Scope,
449                             SIAtomicAddrSpace AddrSpace) const override;
450 
451   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
452                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
453                                       bool IsVolatile, bool IsNonTemporal,
454                                       bool IsLastUse) const override;
455 
456   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
457                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
458                   bool IsCrossAddrSpaceOrdering, Position Pos,
459                   AtomicOrdering Order) const override;
460 
461   bool insertAcquire(MachineBasicBlock::iterator &MI,
462                      SIAtomicScope Scope,
463                      SIAtomicAddrSpace AddrSpace,
464                      Position Pos) const override;
465 
466   bool insertRelease(MachineBasicBlock::iterator &MI,
467                      SIAtomicScope Scope,
468                      SIAtomicAddrSpace AddrSpace,
469                      bool IsCrossAddrSpaceOrdering,
470                      Position Pos) const override;
471 };
472 
473 class SIGfx940CacheControl : public SIGfx90ACacheControl {
474 protected:
475 
476   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
477   /// is modified, false otherwise.
478   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
479     return enableNamedBit(MI, AMDGPU::CPol::SC0);
480   }
481 
482   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
483   /// is modified, false otherwise.
484   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
485     return enableNamedBit(MI, AMDGPU::CPol::SC1);
486   }
487 
488   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
489   /// is modified, false otherwise.
490   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
491     return enableNamedBit(MI, AMDGPU::CPol::NT);
492   }
493 
494 public:
495 
496   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
497 
498   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
499                              SIAtomicScope Scope,
500                              SIAtomicAddrSpace AddrSpace) const override;
501 
502   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
503                               SIAtomicScope Scope,
504                               SIAtomicAddrSpace AddrSpace) const override;
505 
506   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
507                             SIAtomicScope Scope,
508                             SIAtomicAddrSpace AddrSpace) const override;
509 
510   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
511                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
512                                       bool IsVolatile, bool IsNonTemporal,
513                                       bool IsLastUse) const override;
514 
515   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
516                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
517 
518   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
519                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
520                      Position Pos) const override;
521 
522   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
523                            MachineBasicBlock::iterator &MI) const override {
524     bool Changed = false;
525     if (ST.hasForceStoreSC0SC1() &&
526         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
527                                     SIAtomicAddrSpace::GLOBAL |
528                                     SIAtomicAddrSpace::OTHER)) !=
529          SIAtomicAddrSpace::NONE) {
530       Changed |= enableSC0Bit(MI);
531       Changed |= enableSC1Bit(MI);
532     }
533     return Changed;
534   }
535 };
536 
537 class SIGfx10CacheControl : public SIGfx7CacheControl {
538 protected:
539 
540   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
541   /// is modified, false otherwise.
542   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
543     return enableNamedBit(MI, AMDGPU::CPol::DLC);
544   }
545 
546 public:
547 
548   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
549 
550   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
551                              SIAtomicScope Scope,
552                              SIAtomicAddrSpace AddrSpace) const override;
553 
554   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
555                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
556                                       bool IsVolatile, bool IsNonTemporal,
557                                       bool IsLastUse) const override;
558 
559   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
560                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
561                   bool IsCrossAddrSpaceOrdering, Position Pos,
562                   AtomicOrdering Order) const override;
563 
564   bool insertAcquire(MachineBasicBlock::iterator &MI,
565                      SIAtomicScope Scope,
566                      SIAtomicAddrSpace AddrSpace,
567                      Position Pos) const override;
568 };
569 
570 class SIGfx11CacheControl : public SIGfx10CacheControl {
571 public:
572   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
573 
574   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
575                              SIAtomicScope Scope,
576                              SIAtomicAddrSpace AddrSpace) const override;
577 
578   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
579                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
580                                       bool IsVolatile, bool IsNonTemporal,
581                                       bool IsLastUse) const override;
582 };
583 
584 class SIGfx12CacheControl : public SIGfx11CacheControl {
585 protected:
586   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
587   // \returns True if \p MI is modified, false otherwise.
588   bool setTH(const MachineBasicBlock::iterator MI,
589              AMDGPU::CPol::CPol Value) const;
590   // Sets Scope policy to \p Value if CPol operand is present in instruction \p
591   // MI. \returns Returns true if \p MI is modified, false otherwise.
592   bool setScope(const MachineBasicBlock::iterator MI,
593                 AMDGPU::CPol::CPol Value) const;
594 
595   // Stores with system scope (SCOPE_SYS) need to wait for:
596   // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
597   // - non-returning-atomics       - wait for STORECNT==0
598   //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
599   //   since it does not distinguish atomics-with-return from regular stores.
600   // There is no need to wait if memory is cached (mtype != UC).
601   bool
602   insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
603 
604   bool setAtomicScope(const MachineBasicBlock::iterator &MI,
605                       SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
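  // Editor's note: unlike the pre-gfx12 controls above, which toggle individual
  // GLC/SLC/DLC/SC bits, this class expresses cache bypass and temporal hints
  // through the CPol scope and th fields via setScope()/setTH(), so the
  // enable*CacheBypass overrides below all funnel into setAtomicScope().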
606 
607 public:
608   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
609 
610   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
611                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
612                   bool IsCrossAddrSpaceOrdering, Position Pos,
613                   AtomicOrdering Order) const override;
614 
615   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
616                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
617 
618   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
619                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
620                                       bool IsVolatile, bool IsNonTemporal,
621                                       bool IsLastUse) const override;
622 
623   bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
624 
625   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
626                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
627                      Position Pos) const override;
628 
629   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
630                              SIAtomicScope Scope,
631                              SIAtomicAddrSpace AddrSpace) const override {
632     return setAtomicScope(MI, Scope, AddrSpace);
633   }
634 
635   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
636                               SIAtomicScope Scope,
637                               SIAtomicAddrSpace AddrSpace) const override {
638     return setAtomicScope(MI, Scope, AddrSpace);
639   }
640 
641   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
642                             SIAtomicScope Scope,
643                             SIAtomicAddrSpace AddrSpace) const override {
644     return setAtomicScope(MI, Scope, AddrSpace);
645   }
646 };
647 
648 class SIMemoryLegalizer final : public MachineFunctionPass {
649 private:
650 
651   /// Cache Control.
652   std::unique_ptr<SICacheControl> CC = nullptr;
653 
654   /// List of atomic pseudo instructions.
655   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
656 
657   /// Return true iff instruction \p MI is an atomic instruction that
658   /// returns a result.
659   bool isAtomicRet(const MachineInstr &MI) const {
660     return SIInstrInfo::isAtomicRet(MI);
661   }
662 
663   /// Removes all processed atomic pseudo instructions from the current
664   /// function. Returns true if the current function is modified, false otherwise.
665   bool removeAtomicPseudoMIs();
666 
667   /// Expands load operation \p MI. Returns true if instructions are
668   /// added/deleted or \p MI is modified, false otherwise.
669   bool expandLoad(const SIMemOpInfo &MOI,
670                   MachineBasicBlock::iterator &MI);
671   /// Expands store operation \p MI. Returns true if instructions are
672   /// added/deleted or \p MI is modified, false otherwise.
673   bool expandStore(const SIMemOpInfo &MOI,
674                    MachineBasicBlock::iterator &MI);
675   /// Expands atomic fence operation \p MI. Returns true if
676   /// instructions are added/deleted or \p MI is modified, false otherwise.
677   bool expandAtomicFence(const SIMemOpInfo &MOI,
678                          MachineBasicBlock::iterator &MI);
679   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
680   /// instructions are added/deleted or \p MI is modified, false otherwise.
681   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
682                                 MachineBasicBlock::iterator &MI);
683 
684 public:
685   static char ID;
686 
687   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
688 
689   void getAnalysisUsage(AnalysisUsage &AU) const override {
690     AU.setPreservesCFG();
691     MachineFunctionPass::getAnalysisUsage(AU);
692   }
693 
694   StringRef getPassName() const override {
695     return PASS_NAME;
696   }
697 
698   bool runOnMachineFunction(MachineFunction &MF) override;
699 };
700 
701 static const StringMap<SIAtomicAddrSpace> ASNames = {{
702     {"global", SIAtomicAddrSpace::GLOBAL},
703     {"local", SIAtomicAddrSpace::LDS},
704 }};
705 
706 void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
707   const MachineFunction *MF = MI.getMF();
708   const Function &Fn = MF->getFunction();
709   SmallString<128> Str;
710   raw_svector_ostream OS(Str);
711   OS << "unknown address space '" << AS << "'; expected one of ";
712   ListSeparator LS;
713   for (const auto &[Name, Val] : ASNames)
714     OS << LS << '\'' << Name << '\'';
715   DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
716   Fn.getContext().diagnose(BadTag);
717 }
718 
719 /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
720 /// If this tag isn't present, or if it has no meaningful values, returns \p
721 /// Default. Otherwise returns all the address spaces concerned by the MMRA.
722 static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
723                                                SIAtomicAddrSpace Default) {
724   static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
725 
726   auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
727   if (!MMRA)
728     return Default;
729 
730   SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
731   for (const auto &[Prefix, Suffix] : MMRA) {
732     if (Prefix != FenceASPrefix)
733       continue;
734 
735     if (auto It = ASNames.find(Suffix); It != ASNames.end())
736       Result |= It->second;
737     else
738       diagnoseUnknownMMRAASName(MI, Suffix);
739   }
740 
741   return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
742 }
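// Example (editor's sketch of the IR shape, based on the MMRA docs rather than
// this file): a fence such as
//   fence syncscope("workgroup") release, !mmra !0
//   !0 = !{!"amdgpu-as", !"local"}
// is narrowed here to SIAtomicAddrSpace::LDS, while an unrecognized suffix only
// triggers diagnoseUnknownMMRAASName() and falls back to Default.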
743 
744 } // end anonymous namespace
745 
746 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
747                                       const char *Msg) const {
748   const Function &Func = MI->getParent()->getParent()->getFunction();
749   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
750   Func.getContext().diagnose(Diag);
751 }
752 
753 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
754 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
755                                SIAtomicAddrSpace InstrAddrSpace) const {
756   if (SSID == SyncScope::System)
757     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
758   if (SSID == MMI->getAgentSSID())
759     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
760   if (SSID == MMI->getWorkgroupSSID())
761     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
762                       true);
763   if (SSID == MMI->getWavefrontSSID())
764     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
765                       true);
766   if (SSID == SyncScope::SingleThread)
767     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
768                       true);
769   if (SSID == MMI->getSystemOneAddressSpaceSSID())
770     return std::tuple(SIAtomicScope::SYSTEM,
771                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
772   if (SSID == MMI->getAgentOneAddressSpaceSSID())
773     return std::tuple(SIAtomicScope::AGENT,
774                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
775   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
776     return std::tuple(SIAtomicScope::WORKGROUP,
777                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
778   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
779     return std::tuple(SIAtomicScope::WAVEFRONT,
780                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
781   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
782     return std::tuple(SIAtomicScope::SINGLETHREAD,
783                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
784   return std::nullopt;
785 }
786 
787 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
788   if (AS == AMDGPUAS::FLAT_ADDRESS)
789     return SIAtomicAddrSpace::FLAT;
790   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
791     return SIAtomicAddrSpace::GLOBAL;
792   if (AS == AMDGPUAS::LOCAL_ADDRESS)
793     return SIAtomicAddrSpace::LDS;
794   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
795     return SIAtomicAddrSpace::SCRATCH;
796   if (AS == AMDGPUAS::REGION_ADDRESS)
797     return SIAtomicAddrSpace::GDS;
798 
799   return SIAtomicAddrSpace::OTHER;
800 }
801 
802 SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
803     : MMI(&MMI_) {}
804 
805 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
806     const MachineBasicBlock::iterator &MI) const {
807   assert(MI->getNumMemOperands() > 0);
808 
809   SyncScope::ID SSID = SyncScope::SingleThread;
810   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
811   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
812   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
813   bool IsNonTemporal = true;
814   bool IsVolatile = false;
815   bool IsLastUse = false;
816 
817   // Validator should check whether or not MMOs cover the entire set of
818   // locations accessed by the memory instruction.
819   for (const auto &MMO : MI->memoperands()) {
820     IsNonTemporal &= MMO->isNonTemporal();
821     IsVolatile |= MMO->isVolatile();
822     IsLastUse |= MMO->getFlags() & MOLastUse;
823     InstrAddrSpace |=
824       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
825     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
826     if (OpOrdering != AtomicOrdering::NotAtomic) {
827       const auto &IsSyncScopeInclusion =
828           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
829       if (!IsSyncScopeInclusion) {
830         reportUnsupported(MI,
831           "Unsupported non-inclusive atomic synchronization scope");
832         return std::nullopt;
833       }
834 
835       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
836       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
837       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
838              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
839       FailureOrdering =
840           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
841     }
842   }
843 
844   SIAtomicScope Scope = SIAtomicScope::NONE;
845   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
846   bool IsCrossAddressSpaceOrdering = false;
847   if (Ordering != AtomicOrdering::NotAtomic) {
848     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
849     if (!ScopeOrNone) {
850       reportUnsupported(MI, "Unsupported atomic synchronization scope");
851       return std::nullopt;
852     }
853     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
854         *ScopeOrNone;
855     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
856         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
857         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
858       reportUnsupported(MI, "Unsupported atomic address space");
859       return std::nullopt;
860     }
861   }
862   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
863                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
864                      IsNonTemporal, IsLastUse);
865 }
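// Editor's note: in the loop above IsNonTemporal starts out true and is ANDed
// across the memory operands, so a merged access is only treated as
// nontemporal when every MMO is; IsVolatile and IsLastUse are ORed and so are
// sticky if any MMO sets them.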
866 
867 std::optional<SIMemOpInfo>
868 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
869   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
870 
871   if (!(MI->mayLoad() && !MI->mayStore()))
872     return std::nullopt;
873 
874   // Be conservative if there are no memory operands.
875   if (MI->getNumMemOperands() == 0)
876     return SIMemOpInfo();
877 
878   return constructFromMIWithMMO(MI);
879 }
880 
881 std::optional<SIMemOpInfo>
882 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
883   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
884 
885   if (!(!MI->mayLoad() && MI->mayStore()))
886     return std::nullopt;
887 
888   // Be conservative if there are no memory operands.
889   if (MI->getNumMemOperands() == 0)
890     return SIMemOpInfo();
891 
892   return constructFromMIWithMMO(MI);
893 }
894 
895 std::optional<SIMemOpInfo>
896 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
897   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
898 
899   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
900     return std::nullopt;
901 
902   AtomicOrdering Ordering =
903     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
904 
905   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
906   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
907   if (!ScopeOrNone) {
908     reportUnsupported(MI, "Unsupported atomic synchronization scope");
909     return std::nullopt;
910   }
911 
912   SIAtomicScope Scope = SIAtomicScope::NONE;
913   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
914   bool IsCrossAddressSpaceOrdering = false;
915   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
916       *ScopeOrNone;
917 
918   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
919       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
920     reportUnsupported(MI, "Unsupported atomic address space");
921     return std::nullopt;
922   }
923 
924   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
925                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
926 }
927 
928 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
929     const MachineBasicBlock::iterator &MI) const {
930   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
931 
932   if (!(MI->mayLoad() && MI->mayStore()))
933     return std::nullopt;
934 
935   // Be conservative if there are no memory operands.
936   if (MI->getNumMemOperands() == 0)
937     return SIMemOpInfo();
938 
939   return constructFromMIWithMMO(MI);
940 }
941 
942 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
943   TII = ST.getInstrInfo();
944   IV = getIsaVersion(ST.getCPU());
945   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
946 }
947 
948 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
949                                     AMDGPU::CPol::CPol Bit) const {
950   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
951   if (!CPol)
952     return false;
953 
954   CPol->setImm(CPol->getImm() | Bit);
955   return true;
956 }
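// For example (editor's illustration): enableGLCBit() ends up calling
// enableNamedBit(MI, AMDGPU::CPol::GLC), which ORs the GLC flag into the
// instruction's existing cpol immediate; instructions without a cpol operand
// (such as DS accesses) are left untouched and the caller sees false.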
957 
958 /* static */
959 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
960   GCNSubtarget::Generation Generation = ST.getGeneration();
961   if (ST.hasGFX940Insts())
962     return std::make_unique<SIGfx940CacheControl>(ST);
963   if (ST.hasGFX90AInsts())
964     return std::make_unique<SIGfx90ACacheControl>(ST);
965   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
966     return std::make_unique<SIGfx6CacheControl>(ST);
967   if (Generation < AMDGPUSubtarget::GFX10)
968     return std::make_unique<SIGfx7CacheControl>(ST);
969   if (Generation < AMDGPUSubtarget::GFX11)
970     return std::make_unique<SIGfx10CacheControl>(ST);
971   if (Generation < AMDGPUSubtarget::GFX12)
972     return std::make_unique<SIGfx11CacheControl>(ST);
973   return std::make_unique<SIGfx12CacheControl>(ST);
974 }
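// Editor's note: the hasGFX940Insts()/hasGFX90AInsts() checks deliberately
// precede the generation comparisons; both subtargets report a GFX9 generation
// and would otherwise be handed the plain SIGfx7CacheControl.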
975 
976 bool SIGfx6CacheControl::enableLoadCacheBypass(
977     const MachineBasicBlock::iterator &MI,
978     SIAtomicScope Scope,
979     SIAtomicAddrSpace AddrSpace) const {
980   assert(MI->mayLoad() && !MI->mayStore());
981   bool Changed = false;
982 
983   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
984     switch (Scope) {
985     case SIAtomicScope::SYSTEM:
986     case SIAtomicScope::AGENT:
987       // Set L1 cache policy to MISS_EVICT.
988       // Note: there is no L2 cache bypass policy at the ISA level.
989       Changed |= enableGLCBit(MI);
990       break;
991     case SIAtomicScope::WORKGROUP:
992     case SIAtomicScope::WAVEFRONT:
993     case SIAtomicScope::SINGLETHREAD:
994       // No cache to bypass.
995       break;
996     default:
997       llvm_unreachable("Unsupported synchronization scope");
998     }
999   }
1000 
1001   /// The scratch address space does not need the global memory caches
1002   /// to be bypassed as all memory operations by the same thread are
1003   /// sequentially consistent, and no other thread can access scratch
1004   /// memory.
1005 
1006   /// Other address spaces do not have a cache.
1007 
1008   return Changed;
1009 }
1010 
1011 bool SIGfx6CacheControl::enableStoreCacheBypass(
1012     const MachineBasicBlock::iterator &MI,
1013     SIAtomicScope Scope,
1014     SIAtomicAddrSpace AddrSpace) const {
1015   assert(!MI->mayLoad() && MI->mayStore());
1016   bool Changed = false;
1017 
1018   /// The L1 cache is write-through, so it does not need to be bypassed. There is
1019   /// no bypass control for the L2 cache at the ISA level.
1020 
1021   return Changed;
1022 }
1023 
1024 bool SIGfx6CacheControl::enableRMWCacheBypass(
1025     const MachineBasicBlock::iterator &MI,
1026     SIAtomicScope Scope,
1027     SIAtomicAddrSpace AddrSpace) const {
1028   assert(MI->mayLoad() && MI->mayStore());
1029   bool Changed = false;
1030 
1031   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1032   /// bypassed, and the GLC bit is instead used to indicate if they are
1033   /// return or no-return.
1034   /// Note: there is no L2 cache coherent bypass control at the ISA level.
1035 
1036   return Changed;
1037 }
1038 
1039 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1040     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1041     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1042   // Only handle load and store, not atomic read-modify-write instructions. The
1043   // latter use glc to indicate if the atomic returns a result and so must not
1044   // be used for cache control.
1045   assert(MI->mayLoad() ^ MI->mayStore());
1046 
1047   // Only update load and store, not LLVM IR atomic read-modify-write
1048   // instructions. The latter are always marked as volatile, so they cannot be
1049   // handled sensibly here without pessimizing all atomics. They also do not
1050   // support the nontemporal attribute.
1051   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1052 
1053   bool Changed = false;
1054 
1055   if (IsVolatile) {
1056     // Set L1 cache policy to be MISS_EVICT for load instructions
1057     // and MISS_LRU for store instructions.
1058     // Note: there is no L2 cache bypass policy at the ISA level.
1059     if (Op == SIMemOp::LOAD)
1060       Changed |= enableGLCBit(MI);
1061 
1062     // Ensure operation has completed at system scope to cause all volatile
1063     // operations to be visible outside the program in a global order. Do not
1064     // request cross address space as only the global address space can be
1065     // observable outside the program, so no need to cause a waitcnt for LDS
1066     // address space operations.
1067     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1068                           Position::AFTER, AtomicOrdering::Unordered);
1069 
1070     return Changed;
1071   }
1072 
1073   if (IsNonTemporal) {
1074     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1075     // for both loads and stores, and the L2 cache policy to STREAM.
1076     Changed |= enableGLCBit(MI);
1077     Changed |= enableSLCBit(MI);
1078     return Changed;
1079   }
1080 
1081   return Changed;
1082 }
1083 
1084 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1085                                     SIAtomicScope Scope,
1086                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1087                                     bool IsCrossAddrSpaceOrdering, Position Pos,
1088                                     AtomicOrdering Order) const {
1089   bool Changed = false;
1090 
1091   MachineBasicBlock &MBB = *MI->getParent();
1092   DebugLoc DL = MI->getDebugLoc();
1093 
1094   if (Pos == Position::AFTER)
1095     ++MI;
1096 
1097   bool VMCnt = false;
1098   bool LGKMCnt = false;
1099 
1100   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1101       SIAtomicAddrSpace::NONE) {
1102     switch (Scope) {
1103     case SIAtomicScope::SYSTEM:
1104     case SIAtomicScope::AGENT:
1105       VMCnt |= true;
1106       break;
1107     case SIAtomicScope::WORKGROUP:
1108     case SIAtomicScope::WAVEFRONT:
1109     case SIAtomicScope::SINGLETHREAD:
1110       // The L1 cache keeps all memory operations in order for
1111       // wavefronts in the same work-group.
1112       break;
1113     default:
1114       llvm_unreachable("Unsupported synchronization scope");
1115     }
1116   }
1117 
1118   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1119     switch (Scope) {
1120     case SIAtomicScope::SYSTEM:
1121     case SIAtomicScope::AGENT:
1122     case SIAtomicScope::WORKGROUP:
1123       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1124       // not needed as LDS operations for all waves are executed in a total
1125       // global ordering as observed by all waves. Required if also
1126       // synchronizing with global/GDS memory as LDS operations could be
1127       // reordered with respect to later global/GDS memory operations of the
1128       // same wave.
1129       LGKMCnt |= IsCrossAddrSpaceOrdering;
1130       break;
1131     case SIAtomicScope::WAVEFRONT:
1132     case SIAtomicScope::SINGLETHREAD:
1133       // The LDS keeps all memory operations in order for
1134       // the same wavefront.
1135       break;
1136     default:
1137       llvm_unreachable("Unsupported synchronization scope");
1138     }
1139   }
1140 
1141   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1142     switch (Scope) {
1143     case SIAtomicScope::SYSTEM:
1144     case SIAtomicScope::AGENT:
1145       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1146       // is not needed as GDS operations for all waves are executed in a total
1147       // global ordering as observed by all waves. Required if also
1148       // synchronizing with global/LDS memory as GDS operations could be
1149       // reordered with respect to later global/LDS memory operations of the
1150       // same wave.
1151       LGKMCnt |= IsCrossAddrSpaceOrdering;
1152       break;
1153     case SIAtomicScope::WORKGROUP:
1154     case SIAtomicScope::WAVEFRONT:
1155     case SIAtomicScope::SINGLETHREAD:
1156       // The GDS keeps all memory operations in order for
1157       // the same work-group.
1158       break;
1159     default:
1160       llvm_unreachable("Unsupported synchronization scope");
1161     }
1162   }
1163 
1164   if (VMCnt || LGKMCnt) {
1165     unsigned WaitCntImmediate =
1166       AMDGPU::encodeWaitcnt(IV,
1167                             VMCnt ? 0 : getVmcntBitMask(IV),
1168                             getExpcntBitMask(IV),
1169                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1170     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1171         .addImm(WaitCntImmediate);
1172     Changed = true;
1173   }
1174 
1175   if (Pos == Position::AFTER)
1176     --MI;
1177 
1178   return Changed;
1179 }
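// Worked example (editor's illustration): for an agent-scope ordering of a
// global access, only VMCnt is set above, so the emitted S_WAITCNT_soft
// encodes vmcnt(0) while leaving expcnt/lgkmcnt at their no-wait bit masks;
// being a soft waitcnt, SIInsertWaitcnts may later merge or drop it.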
1180 
1181 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1182                                        SIAtomicScope Scope,
1183                                        SIAtomicAddrSpace AddrSpace,
1184                                        Position Pos) const {
1185   if (!InsertCacheInv)
1186     return false;
1187 
1188   bool Changed = false;
1189 
1190   MachineBasicBlock &MBB = *MI->getParent();
1191   DebugLoc DL = MI->getDebugLoc();
1192 
1193   if (Pos == Position::AFTER)
1194     ++MI;
1195 
1196   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1197     switch (Scope) {
1198     case SIAtomicScope::SYSTEM:
1199     case SIAtomicScope::AGENT:
1200       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1201       Changed = true;
1202       break;
1203     case SIAtomicScope::WORKGROUP:
1204     case SIAtomicScope::WAVEFRONT:
1205     case SIAtomicScope::SINGLETHREAD:
1206       // No cache to invalidate.
1207       break;
1208     default:
1209       llvm_unreachable("Unsupported synchronization scope");
1210     }
1211   }
1212 
1213   /// The scratch address space does not need the global memory cache
1214   /// to be flushed as all memory operations by the same thread are
1215   /// sequentially consistent, and no other thread can access scratch
1216   /// memory.
1217 
1218   /// Other address spaces do not have a cache.
1219 
1220   if (Pos == Position::AFTER)
1221     --MI;
1222 
1223   return Changed;
1224 }
1225 
1226 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1227                                        SIAtomicScope Scope,
1228                                        SIAtomicAddrSpace AddrSpace,
1229                                        bool IsCrossAddrSpaceOrdering,
1230                                        Position Pos) const {
1231   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1232                     IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1233 }
1234 
1235 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1236                                        SIAtomicScope Scope,
1237                                        SIAtomicAddrSpace AddrSpace,
1238                                        Position Pos) const {
1239   if (!InsertCacheInv)
1240     return false;
1241 
1242   bool Changed = false;
1243 
1244   MachineBasicBlock &MBB = *MI->getParent();
1245   DebugLoc DL = MI->getDebugLoc();
1246 
1247   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1248 
1249   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1250                                     ? AMDGPU::BUFFER_WBINVL1
1251                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1252 
1253   if (Pos == Position::AFTER)
1254     ++MI;
1255 
1256   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1257     switch (Scope) {
1258     case SIAtomicScope::SYSTEM:
1259     case SIAtomicScope::AGENT:
1260       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1261       Changed = true;
1262       break;
1263     case SIAtomicScope::WORKGROUP:
1264     case SIAtomicScope::WAVEFRONT:
1265     case SIAtomicScope::SINGLETHREAD:
1266       // No cache to invalidate.
1267       break;
1268     default:
1269       llvm_unreachable("Unsupported synchronization scope");
1270     }
1271   }
1272 
1273   /// The scratch address space does not need the global memory cache
1274   /// to be flushed as all memory operations by the same thread are
1275   /// sequentially consistent, and no other thread can access scratch
1276   /// memory.
1277 
1278   /// Other address spaces do not have a cache.
1279 
1280   if (Pos == Position::AFTER)
1281     --MI;
1282 
1283   return Changed;
1284 }
1285 
1286 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1287     const MachineBasicBlock::iterator &MI,
1288     SIAtomicScope Scope,
1289     SIAtomicAddrSpace AddrSpace) const {
1290   assert(MI->mayLoad() && !MI->mayStore());
1291   bool Changed = false;
1292 
1293   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1294     switch (Scope) {
1295     case SIAtomicScope::SYSTEM:
1296     case SIAtomicScope::AGENT:
1297       // Set the L1 cache policy to MISS_LRU.
1298       // Note: there is no L2 cache bypass policy at the ISA level.
1299       Changed |= enableGLCBit(MI);
1300       break;
1301     case SIAtomicScope::WORKGROUP:
1302       // In threadgroup split mode the waves of a work-group can be executing on
1303       // different CUs. Therefore need to bypass the L1 which is per CU.
1304       // Otherwise in non-threadgroup split mode all waves of a work-group are
1305       // on the same CU, and so the L1 does not need to be bypassed.
1306       if (ST.isTgSplitEnabled())
1307         Changed |= enableGLCBit(MI);
1308       break;
1309     case SIAtomicScope::WAVEFRONT:
1310     case SIAtomicScope::SINGLETHREAD:
1311       // No cache to bypass.
1312       break;
1313     default:
1314       llvm_unreachable("Unsupported synchronization scope");
1315     }
1316   }
1317 
1318   /// The scratch address space does not need the global memory caches
1319   /// to be bypassed as all memory operations by the same thread are
1320   /// sequentially consistent, and no other thread can access scratch
1321   /// memory.
1322 
1323   /// Other address spaces do not have a cache.
1324 
1325   return Changed;
1326 }
1327 
1328 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1329     const MachineBasicBlock::iterator &MI,
1330     SIAtomicScope Scope,
1331     SIAtomicAddrSpace AddrSpace) const {
1332   assert(!MI->mayLoad() && MI->mayStore());
1333   bool Changed = false;
1334 
1335   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1336     switch (Scope) {
1337     case SIAtomicScope::SYSTEM:
1338     case SIAtomicScope::AGENT:
1339       /// Do not set glc for store atomic operations as they implicitly write
1340       /// through the L1 cache.
1341       break;
1342     case SIAtomicScope::WORKGROUP:
1343     case SIAtomicScope::WAVEFRONT:
1344     case SIAtomicScope::SINGLETHREAD:
1345       // No cache to bypass. Store atomics implicitly write through the L1
1346       // cache.
1347       break;
1348     default:
1349       llvm_unreachable("Unsupported synchronization scope");
1350     }
1351   }
1352 
1353   /// The scratch address space does not need the global memory caches
1354   /// to be bypassed as all memory operations by the same thread are
1355   /// sequentially consistent, and no other thread can access scratch
1356   /// memory.
1357 
1358   /// Other address spaces do not have a cache.
1359 
1360   return Changed;
1361 }
1362 
1363 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1364     const MachineBasicBlock::iterator &MI,
1365     SIAtomicScope Scope,
1366     SIAtomicAddrSpace AddrSpace) const {
1367   assert(MI->mayLoad() && MI->mayStore());
1368   bool Changed = false;
1369 
1370   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1371     switch (Scope) {
1372     case SIAtomicScope::SYSTEM:
1373     case SIAtomicScope::AGENT:
1374       /// Do not set glc for RMW atomic operations as they implicitly bypass
1375       /// the L1 cache, and the glc bit is instead used to indicate if they are
1376       /// return or no-return.
1377       break;
1378     case SIAtomicScope::WORKGROUP:
1379     case SIAtomicScope::WAVEFRONT:
1380     case SIAtomicScope::SINGLETHREAD:
1381       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1382       break;
1383     default:
1384       llvm_unreachable("Unsupported synchronization scope");
1385     }
1386   }
1387 
1388   return Changed;
1389 }
1390 
1391 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1392     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1393     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1394   // Only handle load and store, not atomic read-modify-write instructions. The
1395   // latter use glc to indicate if the atomic returns a result and so must not
1396   // be used for cache control.
1397   assert(MI->mayLoad() ^ MI->mayStore());
1398 
1399   // Only update load and store, not LLVM IR atomic read-modify-write
1400   // instructions. The latter are always marked as volatile, so they cannot be
1401   // handled sensibly here without pessimizing all atomics. They also do not
1402   // support the nontemporal attribute.
1403   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1404 
1405   bool Changed = false;
1406 
1407   if (IsVolatile) {
1408     // Set L1 cache policy to be MISS_EVICT for load instructions
1409     // and MISS_LRU for store instructions.
1410     // Note: there is no L2 cache bypass policy at the ISA level.
1411     if (Op == SIMemOp::LOAD)
1412       Changed |= enableGLCBit(MI);
1413 
1414     // Ensure operation has completed at system scope to cause all volatile
1415     // operations to be visible outside the program in a global order. Do not
1416     // request cross address space as only the global address space can be
1417     // observable outside the program, so no need to cause a waitcnt for LDS
1418     // address space operations.
1419     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1420                           Position::AFTER, AtomicOrdering::Unordered);
1421 
1422     return Changed;
1423   }
1424 
1425   if (IsNonTemporal) {
1426     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1427     // for both loads and stores, and the L2 cache policy to STREAM.
1428     Changed |= enableGLCBit(MI);
1429     Changed |= enableSLCBit(MI);
1430     return Changed;
1431   }
1432 
1433   return Changed;
1434 }
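// Rough sketch of the effect above (illustrative only; exact instructions and
// waits depend on the surrounding code and on insertWait): a volatile global
// load on gfx90a gets GLC set and is followed by a system-scope wait, while a
// nontemporal access gets both GLC and SLC, approximately:
//   global_load_dword v0, v[2:3], off glc       ; volatile load
//   s_waitcnt vmcnt(0)
//   global_store_dword v[2:3], v1, off glc slc  ; nontemporal store
// The soft wait emitted here may later be merged or relaxed by the
// SIInsertWaitcnts pass.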
1435 
1436 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1437                                       SIAtomicScope Scope,
1438                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1439                                       bool IsCrossAddrSpaceOrdering,
1440                                       Position Pos,
1441                                       AtomicOrdering Order) const {
1442   if (ST.isTgSplitEnabled()) {
1443     // In threadgroup split mode the waves of a work-group can be executing on
1444     // different CUs. Therefore need to wait for global or GDS memory operations
1445     // to complete to ensure they are visible to waves in the other CUs.
1446     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1447     // the same CU, so no need to wait for global memory as all waves in the
1448     // work-group access the same L1, nor wait for GDS as accesses are ordered
1449     // on a CU.
1450     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1451                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1452         (Scope == SIAtomicScope::WORKGROUP)) {
1453       // Same as GFX7 using agent scope.
1454       Scope = SIAtomicScope::AGENT;
1455     }
1456     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1457     // LDS memory operations.
1458     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1459   }
1460   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1461                                         IsCrossAddrSpaceOrdering, Pos, Order);
1462 }
1463 
1464 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1465                                          SIAtomicScope Scope,
1466                                          SIAtomicAddrSpace AddrSpace,
1467                                          Position Pos) const {
1468   if (!InsertCacheInv)
1469     return false;
1470 
1471   bool Changed = false;
1472 
1473   MachineBasicBlock &MBB = *MI->getParent();
1474   DebugLoc DL = MI->getDebugLoc();
1475 
1476   if (Pos == Position::AFTER)
1477     ++MI;
1478 
1479   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1480     switch (Scope) {
1481     case SIAtomicScope::SYSTEM:
1482       // Ensures that following loads will not see stale remote VMEM data or
1483       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1484       // CC will never be stale due to the local memory probes.
1485       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1486       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1487       // hardware does not reorder memory operations by the same wave with
1488       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1489       // remove any cache lines of earlier writes by the same wave and ensures
1490       // later reads by the same wave will refetch the cache lines.
1491       Changed = true;
1492       break;
1493     case SIAtomicScope::AGENT:
1494       // Same as GFX7.
1495       break;
1496     case SIAtomicScope::WORKGROUP:
1497       // In threadgroup split mode the waves of a work-group can be executing on
1498       // different CUs. Therefore need to invalidate the L1 which is per CU.
1499       // Otherwise in non-threadgroup split mode all waves of a work-group are
1500       // on the same CU, and so the L1 does not need to be invalidated.
1501       if (ST.isTgSplitEnabled()) {
1502         // Same as GFX7 using agent scope.
1503         Scope = SIAtomicScope::AGENT;
1504       }
1505       break;
1506     case SIAtomicScope::WAVEFRONT:
1507     case SIAtomicScope::SINGLETHREAD:
1508       // Same as GFX7.
1509       break;
1510     default:
1511       llvm_unreachable("Unsupported synchronization scope");
1512     }
1513   }
1514 
1515   /// The scratch address space does not need the global memory cache
1516   /// to be flushed as all memory operations by the same thread are
1517   /// sequentially consistent, and no other thread can access scratch
1518   /// memory.
1519 
1520   /// Other address spaces do not have a cache.
1521 
1522   if (Pos == Position::AFTER)
1523     --MI;
1524 
1525   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1526 
1527   return Changed;
1528 }
1529 
1530 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1531                                          SIAtomicScope Scope,
1532                                          SIAtomicAddrSpace AddrSpace,
1533                                          bool IsCrossAddrSpaceOrdering,
1534                                          Position Pos) const {
1535   bool Changed = false;
1536 
1537   MachineBasicBlock &MBB = *MI->getParent();
1538   const DebugLoc &DL = MI->getDebugLoc();
1539 
1540   if (Pos == Position::AFTER)
1541     ++MI;
1542 
1543   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1544     switch (Scope) {
1545     case SIAtomicScope::SYSTEM:
1546       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1547       // hardware does not reorder memory operations by the same wave with
1548       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1549       // to initiate writeback of any dirty cache lines of earlier writes by the
1550       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1551       // writeback has completed.
1552       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1553         // Set SC bits to indicate system scope.
1554         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1555       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1556       // vmcnt(0)" needed by the "BUFFER_WBL2".
1557       Changed = true;
1558       break;
1559     case SIAtomicScope::AGENT:
1560     case SIAtomicScope::WORKGROUP:
1561     case SIAtomicScope::WAVEFRONT:
1562     case SIAtomicScope::SINGLETHREAD:
1563       // Same as GFX7.
1564       break;
1565     default:
1566       llvm_unreachable("Unsupported synchronization scope");
1567     }
1568   }
1569 
1570   if (Pos == Position::AFTER)
1571     --MI;
1572 
1573   Changed |=
1574       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1575                                         IsCrossAddrSpaceOrdering, Pos);
1576 
1577   return Changed;
1578 }
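// Illustrative expansion (an assumption about the printed form, not verified
// output): a system-scope release on gfx90a emits a BUFFER_WBL2 with both SC
// bits set, followed by the GFX7-style wait, roughly:
//   buffer_wbl2            ; SC0|SC1 select system scope
//   s_waitcnt vmcnt(0)     ; provided by SIGfx7CacheControl::insertRelease
// Agent and narrower scopes fall through to the plain GFX7 handling.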
1579 
1580 bool SIGfx940CacheControl::enableLoadCacheBypass(
1581     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1582     SIAtomicAddrSpace AddrSpace) const {
1583   assert(MI->mayLoad() && !MI->mayStore());
1584   bool Changed = false;
1585 
1586   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1587     switch (Scope) {
1588     case SIAtomicScope::SYSTEM:
1589       // Set SC bits to indicate system scope.
1590       Changed |= enableSC0Bit(MI);
1591       Changed |= enableSC1Bit(MI);
1592       break;
1593     case SIAtomicScope::AGENT:
1594       // Set SC bits to indicate agent scope.
1595       Changed |= enableSC1Bit(MI);
1596       break;
1597     case SIAtomicScope::WORKGROUP:
1598       // In threadgroup split mode the waves of a work-group can be executing on
1599       // different CUs. Therefore need to bypass the L1 which is per CU.
1600       // Otherwise in non-threadgroup split mode all waves of a work-group are
1601       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1602       // bits to indicate work-group scope will do this automatically.
1603       Changed |= enableSC0Bit(MI);
1604       break;
1605     case SIAtomicScope::WAVEFRONT:
1606     case SIAtomicScope::SINGLETHREAD:
1607       // Leave SC bits unset to indicate wavefront scope.
1608       break;
1609     default:
1610       llvm_unreachable("Unsupported synchronization scope");
1611     }
1612   }
1613 
1614   /// The scratch address space does not need the global memory caches
1615   /// to be bypassed as all memory operations by the same thread are
1616   /// sequentially consistent, and no other thread can access scratch
1617   /// memory.
1618 
1619   /// Other address spaces do not have a cache.
1620 
1621   return Changed;
1622 }
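// Quick reference for the SC-bit encoding applied above to gfx940 loads (this
// simply restates what the switch implements):
//   SYSTEM                   -> SC0 | SC1
//   AGENT                    -> SC1
//   WORKGROUP                -> SC0
//   WAVEFRONT / SINGLETHREAD -> no SC bits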
1623 
1624 bool SIGfx940CacheControl::enableStoreCacheBypass(
1625     const MachineBasicBlock::iterator &MI,
1626     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1627   assert(!MI->mayLoad() && MI->mayStore());
1628   bool Changed = false;
1629 
1630   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1631     switch (Scope) {
1632     case SIAtomicScope::SYSTEM:
1633       // Set SC bits to indicate system scope.
1634       Changed |= enableSC0Bit(MI);
1635       Changed |= enableSC1Bit(MI);
1636       break;
1637     case SIAtomicScope::AGENT:
1638       // Set SC bits to indicate agent scope.
1639       Changed |= enableSC1Bit(MI);
1640       break;
1641     case SIAtomicScope::WORKGROUP:
1642       // Set SC bits to indicate workgroup scope.
1643       Changed |= enableSC0Bit(MI);
1644       break;
1645     case SIAtomicScope::WAVEFRONT:
1646     case SIAtomicScope::SINGLETHREAD:
1647       // Leave SC bits unset to indicate wavefront scope.
1648       break;
1649     default:
1650       llvm_unreachable("Unsupported synchronization scope");
1651     }
1652   }
1653 
1654   /// The scratch address space does not need the global memory caches
1655   /// to be bypassed as all memory operations by the same thread are
1656   /// sequentially consistent, and no other thread can access scratch
1657   /// memory.
1658 
1659   /// Other address spaces do not have a cache.
1660 
1661   return Changed;
1662 }
1663 
1664 bool SIGfx940CacheControl::enableRMWCacheBypass(
1665     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1666     SIAtomicAddrSpace AddrSpace) const {
1667   assert(MI->mayLoad() && MI->mayStore());
1668   bool Changed = false;
1669 
1670   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1671     switch (Scope) {
1672     case SIAtomicScope::SYSTEM:
1673       // Set SC1 bit to indicate system scope.
1674       Changed |= enableSC1Bit(MI);
1675       break;
1676     case SIAtomicScope::AGENT:
1677     case SIAtomicScope::WORKGROUP:
1678     case SIAtomicScope::WAVEFRONT:
1679     case SIAtomicScope::SINGLETHREAD:
1680       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1681       // to indicate system or agent scope. The SC0 bit is used to indicate if
1682       // they are return or no-return. Leave SC1 bit unset to indicate agent
1683       // scope.
1684       break;
1685     default:
1686       llvm_unreachable("Unsupported synchronization scope");
1687     }
1688   }
1689 
1690   return Changed;
1691 }
1692 
1693 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1694     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1695     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1696   // Only handle load and store, not atomic read-modify-write instructions. The
1697   // latter use glc to indicate if the atomic returns a result and so must not
1698   // be used for cache control.
1699   assert(MI->mayLoad() ^ MI->mayStore());
1700 
1701   // Only update load and store, not LLVM IR atomic read-modify-write
1702   // instructions. The latter are always marked as volatile, so they cannot be
1703   // handled sensibly here without pessimizing all atomics. They also do not
1704   // support the nontemporal attribute.
1705   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1706 
1707   bool Changed = false;
1708 
1709   if (IsVolatile) {
1710     // Set SC bits to indicate system scope.
1711     Changed |= enableSC0Bit(MI);
1712     Changed |= enableSC1Bit(MI);
1713 
1714     // Ensure operation has completed at system scope to cause all volatile
1715     // operations to be visible outside the program in a global order. Do not
1716     // request cross address space as only the global address space can be
1717     // observable outside the program, so no need to cause a waitcnt for LDS
1718     // address space operations.
1719     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1720                           Position::AFTER, AtomicOrdering::Unordered);
1721 
1722     return Changed;
1723   }
1724 
1725   if (IsNonTemporal) {
1726     Changed |= enableNTBit(MI);
1727     return Changed;
1728   }
1729 
1730   return Changed;
1731 }
1732 
1733 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1734                                          SIAtomicScope Scope,
1735                                          SIAtomicAddrSpace AddrSpace,
1736                                          Position Pos) const {
1737   if (!InsertCacheInv)
1738     return false;
1739 
1740   bool Changed = false;
1741 
1742   MachineBasicBlock &MBB = *MI->getParent();
1743   DebugLoc DL = MI->getDebugLoc();
1744 
1745   if (Pos == Position::AFTER)
1746     ++MI;
1747 
1748   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1749     switch (Scope) {
1750     case SIAtomicScope::SYSTEM:
1751       // Ensures that following loads will not see stale remote VMEM data or
1752       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1753       // CC will never be stale due to the local memory probes.
1754       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1755           // Set SC bits to indicate system scope.
1756           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1757       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1758       // hardware does not reorder memory operations by the same wave with
1759       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1760       // remove any cache lines of earlier writes by the same wave and ensures
1761       // later reads by the same wave will refetch the cache lines.
1762       Changed = true;
1763       break;
1764     case SIAtomicScope::AGENT:
1765       // Ensures that following loads will not see stale remote data or local
1766       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1767       // due to the memory probes.
1768       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1769           // Set SC bits to indicate agent scope.
1770           .addImm(AMDGPU::CPol::SC1);
1771       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1772       // does not reorder memory operations with respect to the preceding buffer
1773       // invalidate. The invalidate is guaranteed to remove any cache lines of
1774       // earlier writes and ensures later reads will refetch the cache lines.
1775       Changed = true;
1776       break;
1777     case SIAtomicScope::WORKGROUP:
1778       // In threadgroup split mode the waves of a work-group can be executing on
1779       // different CUs. Therefore need to invalidate the L1 which is per CU.
1780       // Otherwise in non-threadgroup split mode all waves of a work-group are
1781       // on the same CU, and so the L1 does not need to be invalidated.
1782       if (ST.isTgSplitEnabled()) {
1783         // Ensures L1 is invalidated if in threadgroup split mode. In
1784         // non-threadgroup split mode it is a NOP, but no point generating it in
1785         // that case if know not in that mode.
1786         // that case if we know we are not in that mode.
1787             // Set SC bits to indicate work-group scope.
1788             .addImm(AMDGPU::CPol::SC0);
1789         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1790         // does not reorder memory operations with respect to the preceding buffer
1791         // invalidate. The invalidate is guaranteed to remove any cache lines of
1792         // earlier writes and ensures later reads will refetch the cache lines.
1793         Changed = true;
1794       }
1795       break;
1796     case SIAtomicScope::WAVEFRONT:
1797     case SIAtomicScope::SINGLETHREAD:
1798       // Could generate "BUFFER_INV" but it would do nothing as there are no
1799       // caches to invalidate.
1800       break;
1801     default:
1802       llvm_unreachable("Unsupported synchronization scope");
1803     }
1804   }
1805 
1806   /// The scratch address space does not need the global memory cache
1807   /// to be flushed as all memory operations by the same thread are
1808   /// sequentially consistent, and no other thread can access scratch
1809   /// memory.
1810 
1811   /// Other address spaces do not have a cache.
1812 
1813   if (Pos == Position::AFTER)
1814     --MI;
1815 
1816   return Changed;
1817 }
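// Illustrative acquire expansions on gfx940 (a sketch of the expected printed
// form, not verified output):
//   system scope:     buffer_inv sc0 sc1
//   agent scope:      buffer_inv sc1
//   workgroup scope:  buffer_inv sc0   (only with threadgroup split enabled)
// No invalidate is emitted for wavefront or single-thread scope.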
1818 
1819 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1820                                          SIAtomicScope Scope,
1821                                          SIAtomicAddrSpace AddrSpace,
1822                                          bool IsCrossAddrSpaceOrdering,
1823                                          Position Pos) const {
1824   bool Changed = false;
1825 
1826   MachineBasicBlock &MBB = *MI->getParent();
1827   DebugLoc DL = MI->getDebugLoc();
1828 
1829   if (Pos == Position::AFTER)
1830     ++MI;
1831 
1832   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1833     switch (Scope) {
1834     case SIAtomicScope::SYSTEM:
1835       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1836       // hardware does not reorder memory operations by the same wave with
1837       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1838       // to initiate writeback of any dirty cache lines of earlier writes by the
1839       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1840       // writeback has completed.
1841       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1842           // Set SC bits to indicate system scope.
1843           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1844       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1845       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1846       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1847       Changed = true;
1848       break;
1849     case SIAtomicScope::AGENT:
1850       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1851           // Set SC bits to indicate agent scope.
1852           .addImm(AMDGPU::CPol::SC1);
1853 
1854       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1855       // SIAtomicScope::AGENT, the following insertWait will generate the
1856       // required "S_WAITCNT vmcnt(0)".
1857       Changed = true;
1858       break;
1859     case SIAtomicScope::WORKGROUP:
1860     case SIAtomicScope::WAVEFRONT:
1861     case SIAtomicScope::SINGLETHREAD:
1862       // Do not generate "BUFFER_WBL2" as there are no caches it would
1863       // write back, and it would require an otherwise unnecessary
1864       // "S_WAITCNT vmcnt(0)".
1865       break;
1866     default:
1867       llvm_unreachable("Unsupported synchronization scope");
1868     }
1869   }
1870 
1871   if (Pos == Position::AFTER)
1872     --MI;
1873 
1874   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1875   // S_WAITCNT needed.
1876   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1877                         IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1878 
1879   return Changed;
1880 }
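// Illustrative release expansion on gfx940 (a sketch under the assumptions
// above): at system scope this emits a writeback followed by the wait added
// by insertWait, roughly:
//   buffer_wbl2 sc0 sc1
//   s_waitcnt vmcnt(0)
// Agent scope drops SC0; workgroup and narrower scopes rely on insertWait
// alone.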
1881 
1882 bool SIGfx10CacheControl::enableLoadCacheBypass(
1883     const MachineBasicBlock::iterator &MI,
1884     SIAtomicScope Scope,
1885     SIAtomicAddrSpace AddrSpace) const {
1886   assert(MI->mayLoad() && !MI->mayStore());
1887   bool Changed = false;
1888 
1889   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1890     switch (Scope) {
1891     case SIAtomicScope::SYSTEM:
1892     case SIAtomicScope::AGENT:
1893       // Set the L0 and L1 cache policies to MISS_EVICT.
1894       // Note: there is no L2 cache coherent bypass control at the ISA level.
1895       Changed |= enableGLCBit(MI);
1896       Changed |= enableDLCBit(MI);
1897       break;
1898     case SIAtomicScope::WORKGROUP:
1899       // In WGP mode the waves of a work-group can be executing on either CU of
1900       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1901       // CU mode all waves of a work-group are on the same CU, and so the L0
1902       // does not need to be bypassed.
1903       if (!ST.isCuModeEnabled())
1904         Changed |= enableGLCBit(MI);
1905       break;
1906     case SIAtomicScope::WAVEFRONT:
1907     case SIAtomicScope::SINGLETHREAD:
1908       // No cache to bypass.
1909       break;
1910     default:
1911       llvm_unreachable("Unsupported synchronization scope");
1912     }
1913   }
1914 
1915   /// The scratch address space does not need the global memory caches
1916   /// to be bypassed as all memory operations by the same thread are
1917   /// sequentially consistent, and no other thread can access scratch
1918   /// memory.
1919 
1920   /// Other address spaces do not have a cache.
1921 
1922   return Changed;
1923 }
1924 
1925 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1926     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1927     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1928 
1929   // Only handle load and store, not atomic read-modify-write instructions. The
1930   // latter use glc to indicate if the atomic returns a result and so must not
1931   // be used for cache control.
1932   assert(MI->mayLoad() ^ MI->mayStore());
1933 
1934   // Only update load and store, not LLVM IR atomic read-modify-write
1935   // instructions. The latter are always marked as volatile, so they cannot be
1936   // handled sensibly here without pessimizing all atomics. They also do not
1937   // support the nontemporal attribute.
1938   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1939 
1940   bool Changed = false;
1941 
1942   if (IsVolatile) {
1943     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1944     // and MISS_LRU for store instructions.
1945     // Note: there is no L2 cache coherent bypass control at the ISA level.
1946     if (Op == SIMemOp::LOAD) {
1947       Changed |= enableGLCBit(MI);
1948       Changed |= enableDLCBit(MI);
1949     }
1950 
1951     // Ensure operation has completed at system scope to cause all volatile
1952     // operations to be visible outside the program in a global order. Do not
1953     // request cross address space as only the global address space can be
1954     // observable outside the program, so no need to cause a waitcnt for LDS
1955     // address space operations.
1956     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1957                           Position::AFTER, AtomicOrdering::Unordered);
1958     return Changed;
1959   }
1960 
1961   if (IsNonTemporal) {
1962     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1963     // and L2 cache policy to STREAM.
1964     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1965     // to MISS_EVICT and the L2 cache policy to STREAM.
1966     if (Op == SIMemOp::STORE)
1967       Changed |= enableGLCBit(MI);
1968     Changed |= enableSLCBit(MI);
1969 
1970     return Changed;
1971   }
1972 
1973   return Changed;
1974 }
1975 
1976 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1977                                      SIAtomicScope Scope,
1978                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1979                                      bool IsCrossAddrSpaceOrdering,
1980                                      Position Pos, AtomicOrdering Order) const {
1981   bool Changed = false;
1982 
1983   MachineBasicBlock &MBB = *MI->getParent();
1984   DebugLoc DL = MI->getDebugLoc();
1985 
1986   if (Pos == Position::AFTER)
1987     ++MI;
1988 
1989   bool VMCnt = false;
1990   bool VSCnt = false;
1991   bool LGKMCnt = false;
1992 
1993   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1994       SIAtomicAddrSpace::NONE) {
1995     switch (Scope) {
1996     case SIAtomicScope::SYSTEM:
1997     case SIAtomicScope::AGENT:
1998       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1999         VMCnt |= true;
2000       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2001         VSCnt |= true;
2002       break;
2003     case SIAtomicScope::WORKGROUP:
2004       // In WGP mode the waves of a work-group can be executing on either CU of
2005       // the WGP. Therefore need to wait for operations to complete to ensure
2006       // they are visible to waves in the other CU as the L0 is per CU.
2007       // Otherwise in CU mode all waves of a work-group are on the same CU,
2008       // which shares the same L0.
2009       if (!ST.isCuModeEnabled()) {
2010         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2011           VMCnt |= true;
2012         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2013           VSCnt |= true;
2014       }
2015       break;
2016     case SIAtomicScope::WAVEFRONT:
2017     case SIAtomicScope::SINGLETHREAD:
2018       // The L0 cache keeps all memory operations in order for
2019       // work-items in the same wavefront.
2020       break;
2021     default:
2022       llvm_unreachable("Unsupported synchronization scope");
2023     }
2024   }
2025 
2026   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2027     switch (Scope) {
2028     case SIAtomicScope::SYSTEM:
2029     case SIAtomicScope::AGENT:
2030     case SIAtomicScope::WORKGROUP:
2031       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2032       // not needed as LDS operations for all waves are executed in a total
2033       // global ordering as observed by all waves. Required if also
2034       // synchronizing with global/GDS memory as LDS operations could be
2035       // reordered with respect to later global/GDS memory operations of the
2036       // same wave.
2037       LGKMCnt |= IsCrossAddrSpaceOrdering;
2038       break;
2039     case SIAtomicScope::WAVEFRONT:
2040     case SIAtomicScope::SINGLETHREAD:
2041       // The LDS keeps all memory operations in order for
2042       // the same wavefront.
2043       break;
2044     default:
2045       llvm_unreachable("Unsupported synchronization scope");
2046     }
2047   }
2048 
2049   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2050     switch (Scope) {
2051     case SIAtomicScope::SYSTEM:
2052     case SIAtomicScope::AGENT:
2053       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2054       // is not needed as GDS operations for all waves are executed in a total
2055       // global ordering as observed by all waves. Required if also
2056       // synchronizing with global/LDS memory as GDS operations could be
2057       // reordered with respect to later global/LDS memory operations of the
2058       // same wave.
2059       LGKMCnt |= IsCrossAddrSpaceOrdering;
2060       break;
2061     case SIAtomicScope::WORKGROUP:
2062     case SIAtomicScope::WAVEFRONT:
2063     case SIAtomicScope::SINGLETHREAD:
2064       // The GDS keeps all memory operations in order for
2065       // the same work-group.
2066       break;
2067     default:
2068       llvm_unreachable("Unsupported synchronization scope");
2069     }
2070   }
2071 
2072   if (VMCnt || LGKMCnt) {
2073     unsigned WaitCntImmediate =
2074       AMDGPU::encodeWaitcnt(IV,
2075                             VMCnt ? 0 : getVmcntBitMask(IV),
2076                             getExpcntBitMask(IV),
2077                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2078     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2079         .addImm(WaitCntImmediate);
2080     Changed = true;
2081   }
2082 
2083   if (VSCnt) {
2084     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2085         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2086         .addImm(0);
2087     Changed = true;
2088   }
2089 
2090   if (Pos == Position::AFTER)
2091     --MI;
2092 
2093   return Changed;
2094 }
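// Illustrative sketch (assumed encoding and printed form, not verified
// output): a sequentially consistent, cross-address-space fence at agent
// scope covering global and LDS memory would be expected to expand to roughly:
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0
// using the soft waitcnt pseudos above, which SIInsertWaitcnts may later
// relax or merge.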
2095 
2096 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2097                                         SIAtomicScope Scope,
2098                                         SIAtomicAddrSpace AddrSpace,
2099                                         Position Pos) const {
2100   if (!InsertCacheInv)
2101     return false;
2102 
2103   bool Changed = false;
2104 
2105   MachineBasicBlock &MBB = *MI->getParent();
2106   DebugLoc DL = MI->getDebugLoc();
2107 
2108   if (Pos == Position::AFTER)
2109     ++MI;
2110 
2111   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2112     switch (Scope) {
2113     case SIAtomicScope::SYSTEM:
2114     case SIAtomicScope::AGENT:
2115       // The order of invalidates matters here. We must invalidate "outer in"
2116       // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2117       // invalidated.
2118       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2119       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2120       Changed = true;
2121       break;
2122     case SIAtomicScope::WORKGROUP:
2123       // In WGP mode the waves of a work-group can be executing on either CU of
2124       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2125       // in CU mode all waves of a work-group are on the same CU, and so the
2126       // L0 does not need to be invalidated.
2127       if (!ST.isCuModeEnabled()) {
2128         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2129         Changed = true;
2130       }
2131       break;
2132     case SIAtomicScope::WAVEFRONT:
2133     case SIAtomicScope::SINGLETHREAD:
2134       // No cache to invalidate.
2135       break;
2136     default:
2137       llvm_unreachable("Unsupported synchronization scope");
2138     }
2139   }
2140 
2141   /// The scratch address space does not need the global memory cache
2142   /// to be flushed as all memory operations by the same thread are
2143   /// sequentially consistent, and no other thread can access scratch
2144   /// memory.
2145 
2146   /// Other address spaces do not have a cache.
2147 
2148   if (Pos == Position::AFTER)
2149     --MI;
2150 
2151   return Changed;
2152 }
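// Illustrative acquire expansion on gfx10 (a sketch, not verified output):
// at agent or system scope the two invalidates above appear back to back,
//   buffer_gl1_inv
//   buffer_gl0_inv
// with the L1 invalidate deliberately first so the L0 cannot refill from
// stale L1 data; workgroup scope in WGP mode emits only buffer_gl0_inv.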
2153 
2154 bool SIGfx11CacheControl::enableLoadCacheBypass(
2155     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2156     SIAtomicAddrSpace AddrSpace) const {
2157   assert(MI->mayLoad() && !MI->mayStore());
2158   bool Changed = false;
2159 
2160   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2161     switch (Scope) {
2162     case SIAtomicScope::SYSTEM:
2163     case SIAtomicScope::AGENT:
2164       // Set the L0 and L1 cache policies to MISS_EVICT.
2165       // Note: there is no L2 cache coherent bypass control at the ISA level.
2166       Changed |= enableGLCBit(MI);
2167       break;
2168     case SIAtomicScope::WORKGROUP:
2169       // In WGP mode the waves of a work-group can be executing on either CU of
2170       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2171       // CU mode all waves of a work-group are on the same CU, and so the L0
2172       // does not need to be bypassed.
2173       if (!ST.isCuModeEnabled())
2174         Changed |= enableGLCBit(MI);
2175       break;
2176     case SIAtomicScope::WAVEFRONT:
2177     case SIAtomicScope::SINGLETHREAD:
2178       // No cache to bypass.
2179       break;
2180     default:
2181       llvm_unreachable("Unsupported synchronization scope");
2182     }
2183   }
2184 
2185   /// The scratch address space does not need the global memory caches
2186   /// to be bypassed as all memory operations by the same thread are
2187   /// sequentially consistent, and no other thread can access scratch
2188   /// memory.
2189 
2190   /// Other address spaces do not have a cache.
2191 
2192   return Changed;
2193 }
2194 
2195 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2196     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2197     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2198 
2199   // Only handle load and store, not atomic read-modify-write instructions. The
2200   // latter use glc to indicate if the atomic returns a result and so must not
2201   // be used for cache control.
2202   assert(MI->mayLoad() ^ MI->mayStore());
2203 
2204   // Only update load and store, not LLVM IR atomic read-modify-write
2205   // instructions. The latter are always marked as volatile, so they cannot be
2206   // handled sensibly here without pessimizing all atomics. They also do not
2207   // support the nontemporal attribute.
2208   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2209 
2210   bool Changed = false;
2211 
2212   if (IsVolatile) {
2213     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2214     // and MISS_LRU for store instructions.
2215     // Note: there is no L2 cache coherent bypass control at the ISA level.
2216     if (Op == SIMemOp::LOAD)
2217       Changed |= enableGLCBit(MI);
2218 
2219     // Set MALL NOALLOC for load and store instructions.
2220     Changed |= enableDLCBit(MI);
2221 
2222     // Ensure operation has completed at system scope to cause all volatile
2223     // operations to be visible outside the program in a global order. Do not
2224     // request cross address space as only the global address space can be
2225     // observable outside the program, so no need to cause a waitcnt for LDS
2226     // address space operations.
2227     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2228                           Position::AFTER, AtomicOrdering::Unordered);
2229     return Changed;
2230   }
2231 
2232   if (IsNonTemporal) {
2233     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2234     // and L2 cache policy to STREAM.
2235     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2236     // to MISS_EVICT and the L2 cache policy to STREAM.
2237     if (Op == SIMemOp::STORE)
2238       Changed |= enableGLCBit(MI);
2239     Changed |= enableSLCBit(MI);
2240 
2241     // Set MALL NOALLOC for load and store instructions.
2242     Changed |= enableDLCBit(MI);
2243     return Changed;
2244   }
2245 
2246   return Changed;
2247 }
2248 
2249 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2250                                 AMDGPU::CPol::CPol Value) const {
2251   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2252   if (!CPol)
2253     return false;
2254 
2255   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2256   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2257     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2258     return true;
2259   }
2260 
2261   return false;
2262 }
2263 
2264 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2265                                    AMDGPU::CPol::CPol Value) const {
2266   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2267   if (!CPol)
2268     return false;
2269 
2270   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2271   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2272     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2273     return true;
2274   }
2275 
2276   return false;
2277 }
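// Both helpers above perform a read-modify-write of only their own CPol
// bitfield. As a hypothetical example: if the cpol operand currently encodes
// TH_NT | SCOPE_SE, then setScope(MI, SCOPE_SYS) rewrites it to
// TH_NT | SCOPE_SYS and reports a change, while setScope(MI, SCOPE_SE) would
// leave the operand untouched and return false.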
2278 
2279 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2280     const MachineBasicBlock::iterator MI) const {
2281   // TODO: implement flag for frontend to give us a hint not to insert waits.
2282 
2283   MachineBasicBlock &MBB = *MI->getParent();
2284   const DebugLoc &DL = MI->getDebugLoc();
2285 
2286   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2287   BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2288   BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2289   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2290   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2291 
2292   return true;
2293 }
2294 
2295 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2296                                      SIAtomicScope Scope,
2297                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2298                                      bool IsCrossAddrSpaceOrdering,
2299                                      Position Pos, AtomicOrdering Order) const {
2300   bool Changed = false;
2301 
2302   MachineBasicBlock &MBB = *MI->getParent();
2303   DebugLoc DL = MI->getDebugLoc();
2304 
2305   bool LOADCnt = false;
2306   bool DSCnt = false;
2307   bool STORECnt = false;
2308 
2309   if (Pos == Position::AFTER)
2310     ++MI;
2311 
2312   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2313       SIAtomicAddrSpace::NONE) {
2314     switch (Scope) {
2315     case SIAtomicScope::SYSTEM:
2316     case SIAtomicScope::AGENT:
2317       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2318         LOADCnt |= true;
2319       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2320         STORECnt |= true;
2321       break;
2322     case SIAtomicScope::WORKGROUP:
2323       // In WGP mode the waves of a work-group can be executing on either CU of
2324       // the WGP. Therefore need to wait for operations to complete to ensure
2325       // they are visible to waves in the other CU as the L0 is per CU.
2326       // Otherwise in CU mode all waves of a work-group are on the same CU,
2327       // which shares the same L0.
2328       if (!ST.isCuModeEnabled()) {
2329         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2330           LOADCnt |= true;
2331         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2332           STORECnt |= true;
2333       }
2334       break;
2335     case SIAtomicScope::WAVEFRONT:
2336     case SIAtomicScope::SINGLETHREAD:
2337       // The L0 cache keeps all memory operations in order for
2338       // work-items in the same wavefront.
2339       break;
2340     default:
2341       llvm_unreachable("Unsupported synchronization scope");
2342     }
2343   }
2344 
2345   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2346     switch (Scope) {
2347     case SIAtomicScope::SYSTEM:
2348     case SIAtomicScope::AGENT:
2349     case SIAtomicScope::WORKGROUP:
2350       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2351       // not needed as LDS operations for all waves are executed in a total
2352       // global ordering as observed by all waves. Required if also
2353       // synchronizing with global/GDS memory as LDS operations could be
2354       // reordered with respect to later global/GDS memory operations of the
2355       // same wave.
2356       DSCnt |= IsCrossAddrSpaceOrdering;
2357       break;
2358     case SIAtomicScope::WAVEFRONT:
2359     case SIAtomicScope::SINGLETHREAD:
2360       // The LDS keeps all memory operations in order for
2361       // the same wavefront.
2362       break;
2363     default:
2364       llvm_unreachable("Unsupported synchronization scope");
2365     }
2366   }
2367 
2368   if (LOADCnt) {
2369     // Acquire sequences only need to wait on the previous atomic operation.
2370     // e.g. a typical sequence looks like
2371     //    atomic load
2372     //    (wait)
2373     //    global_inv
2374     //
2375     // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2376     // to be tracked using loadcnt.
2377     //
2378     // This also applies to fences. Fences cannot pair with an instruction
2379     // tracked with bvh/samplecnt as we don't have any atomics that do that.
2380     if (Order != AtomicOrdering::Acquire) {
2381       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2382       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2383     }
2384     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2385     Changed = true;
2386   }
2387 
2388   if (STORECnt) {
2389     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2390     Changed = true;
2391   }
2392 
2393   if (DSCnt) {
2394     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2395     Changed = true;
2396   }
2397 
2398   if (Pos == Position::AFTER)
2399     --MI;
2400 
2401   return Changed;
2402 }
2403 
2404 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2405                                         SIAtomicScope Scope,
2406                                         SIAtomicAddrSpace AddrSpace,
2407                                         Position Pos) const {
2408   if (!InsertCacheInv)
2409     return false;
2410 
2411   MachineBasicBlock &MBB = *MI->getParent();
2412   DebugLoc DL = MI->getDebugLoc();
2413 
2414   /// The scratch address space does not need the global memory cache
2415   /// to be flushed as all memory operations by the same thread are
2416   /// sequentially consistent, and no other thread can access scratch
2417   /// memory.
2418 
2419   /// Other address spaces do not have a cache.
2420   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2421     return false;
2422 
2423   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2424   switch (Scope) {
2425   case SIAtomicScope::SYSTEM:
2426     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2427     break;
2428   case SIAtomicScope::AGENT:
2429     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2430     break;
2431   case SIAtomicScope::WORKGROUP:
2432     // In WGP mode the waves of a work-group can be executing on either CU of
2433     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2434     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2435     // the L0 does not need to be invalidated.
2436     if (ST.isCuModeEnabled())
2437       return false;
2438 
2439     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2440     break;
2441   case SIAtomicScope::WAVEFRONT:
2442   case SIAtomicScope::SINGLETHREAD:
2443     // No cache to invalidate.
2444     return false;
2445   default:
2446     llvm_unreachable("Unsupported synchronization scope");
2447   }
2448 
2449   if (Pos == Position::AFTER)
2450     ++MI;
2451 
2452   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2453 
2454   if (Pos == Position::AFTER)
2455     --MI;
2456 
2457   return true;
2458 }
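// Illustrative gfx12 acquire (a sketch of the assumed printed form, not
// verified output): an agent-scope acquire becomes a single cache invalidate
// such as
//   global_inv scope:SCOPE_DEV
// with SCOPE_SYS used for system scope and SCOPE_SE for workgroup scope in
// WGP mode; CU mode and narrower scopes need no invalidate.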
2459 
2460 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2461                                         SIAtomicScope Scope,
2462                                         SIAtomicAddrSpace AddrSpace,
2463                                         bool IsCrossAddrSpaceOrdering,
2464                                         Position Pos) const {
2465   MachineBasicBlock &MBB = *MI->getParent();
2466   DebugLoc DL = MI->getDebugLoc();
2467 
2468   // The scratch address space does not need the global memory cache
2469   // writeback as all memory operations by the same thread are
2470   // sequentially consistent, and no other thread can access scratch
2471   // memory.
2472 
2473   // Other address spaces do not have a cache.
2474   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2475     return false;
2476 
2477   if (Pos == Position::AFTER)
2478     ++MI;
2479 
2480   // global_wb is only necessary at system scope for gfx120x targets.
2481   //
2482   // Emitting it for lower scopes is a slow no-op, so we omit it
2483   // for performance.
2484   switch (Scope) {
2485   case SIAtomicScope::SYSTEM:
2486     BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2487         .addImm(AMDGPU::CPol::SCOPE_SYS);
2488     break;
2489   case SIAtomicScope::AGENT:
2490   case SIAtomicScope::WORKGROUP:
2491     // No WB necessary, but we still have to wait.
2492     break;
2493   case SIAtomicScope::WAVEFRONT:
2494   case SIAtomicScope::SINGLETHREAD:
2495     // No WB or wait necessary here.
2496     return false;
2497   default:
2498     llvm_unreachable("Unsupported synchronization scope");
2499   }
2500 
2501   if (Pos == Position::AFTER)
2502     --MI;
2503 
2504   // We always have to wait for previous memory operations (load/store) to
2505   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2506   // we of course need to wait for that as well.
2507   insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2508              IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2509 
2510   return true;
2511 }
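// Illustrative gfx12 release (a sketch under the assumptions above): only
// system scope adds the writeback, so a system-scope release looks roughly
// like
//   global_wb scope:SCOPE_SYS
// followed by the waits produced by insertWait (at least s_wait_storecnt 0x0,
// plus load/ds waits as applicable); agent and workgroup scopes keep just the
// waits.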
2512 
2513 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2514     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2515     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2516 
2517   // Only handle load and store, not atomic read-modify-write instructions.
2518   assert(MI->mayLoad() ^ MI->mayStore());
2519 
2520   // Only update load and store, not LLVM IR atomic read-modify-write
2521   // instructions. The latter are always marked as volatile, so they cannot be
2522   // handled sensibly here without pessimizing all atomics. They also do not
2523   // support the nontemporal attribute.
2524   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2525 
2526   bool Changed = false;
2527 
2528   if (IsLastUse) {
2529     // Set last-use hint.
2530     Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2531   } else if (IsNonTemporal) {
2532     // Set non-temporal hint for all cache levels.
2533     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2534   }
2535 
2536   if (IsVolatile) {
2537     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2538 
2539     if (Op == SIMemOp::STORE)
2540       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2541 
2542     // Ensure operation has completed at system scope to cause all volatile
2543     // operations to be visible outside the program in a global order. Do not
2544     // request cross address space as only the global address space can be
2545     // observable outside the program, so no need to cause a waitcnt for LDS
2546     // address space operations.
2547     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2548                           Position::AFTER, AtomicOrdering::Unordered);
2549   }
2550 
2551   return Changed;
2552 }
2553 
2554 bool SIGfx12CacheControl::expandSystemScopeStore(
2555     MachineBasicBlock::iterator &MI) const {
2556   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2557   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2558     return insertWaitsBeforeSystemScopeStore(MI);
2559 
2560   return false;
2561 }
2562 
2563 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2564                                          SIAtomicScope Scope,
2565                                          SIAtomicAddrSpace AddrSpace) const {
2566   bool Changed = false;
2567 
2568   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2569     switch (Scope) {
2570     case SIAtomicScope::SYSTEM:
2571       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2572       break;
2573     case SIAtomicScope::AGENT:
2574       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2575       break;
2576     case SIAtomicScope::WORKGROUP:
2577       // In WGP mode, SCOPE_SE is needed as waves can execute on
2578       // different CUs that access different L0s.
2579       if (!ST.isCuModeEnabled())
2580         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2581       break;
2582     case SIAtomicScope::WAVEFRONT:
2583     case SIAtomicScope::SINGLETHREAD:
2584       // No cache to bypass.
2585       break;
2586     default:
2587       llvm_unreachable("Unsupported synchronization scope");
2588     }
2589   }
2590 
2591   // The scratch address space does not need the global memory caches
2592   // to be bypassed as all memory operations by the same thread are
2593   // sequentially consistent, and no other thread can access scratch
2594   // memory.
2595 
2596   // Other address spaces do not have a cache.
2597 
2598   return Changed;
2599 }
2600 
2601 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2602   if (AtomicPseudoMIs.empty())
2603     return false;
2604 
2605   for (auto &MI : AtomicPseudoMIs)
2606     MI->eraseFromParent();
2607 
2608   AtomicPseudoMIs.clear();
2609   return true;
2610 }
2611 
2612 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2613                                    MachineBasicBlock::iterator &MI) {
2614   assert(MI->mayLoad() && !MI->mayStore());
2615 
2616   bool Changed = false;
2617 
2618   if (MOI.isAtomic()) {
2619     const AtomicOrdering Order = MOI.getOrdering();
2620     if (Order == AtomicOrdering::Monotonic ||
2621         Order == AtomicOrdering::Acquire ||
2622         Order == AtomicOrdering::SequentiallyConsistent) {
2623       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2624                                            MOI.getOrderingAddrSpace());
2625     }
2626 
2627     if (Order == AtomicOrdering::SequentiallyConsistent)
2628       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2629                                 SIMemOp::LOAD | SIMemOp::STORE,
2630                                 MOI.getIsCrossAddressSpaceOrdering(),
2631                                 Position::BEFORE, Order);
2632 
2633     if (Order == AtomicOrdering::Acquire ||
2634         Order == AtomicOrdering::SequentiallyConsistent) {
2635       Changed |= CC->insertWait(
2636           MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2637           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2638       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2639                                    MOI.getOrderingAddrSpace(),
2640                                    Position::AFTER);
2641     }
2642 
2643     return Changed;
2644   }
2645 
2646   // Atomic instructions already bypass caches to the scope specified by the
2647   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2648   // instructions need additional treatment.
2649   Changed |= CC->enableVolatileAndOrNonTemporal(
2650       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2651       MOI.isNonTemporal(), MOI.isLastUse());
2652 
2653   return Changed;
2654 }
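// Putting the pieces together, a sketch of how a sequentially consistent
// atomic load is legalized by the code above (illustrative only; the exact
// instructions are target dependent):
//   <wait for all prior loads/stores>         // insertWait, Position::BEFORE
//   <atomic load with cache-bypass bits set>  // enableLoadCacheBypass
//   <wait for the load itself>                // insertWait, Position::AFTER
//   <cache invalidate>                        // insertAcquire, Position::AFTER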
2655 
2656 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2657                                     MachineBasicBlock::iterator &MI) {
2658   assert(!MI->mayLoad() && MI->mayStore());
2659 
2660   bool Changed = false;
2661 
2662   if (MOI.isAtomic()) {
2663     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2664         MOI.getOrdering() == AtomicOrdering::Release ||
2665         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2666       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2667                                             MOI.getOrderingAddrSpace());
2668     }
2669 
2670     if (MOI.getOrdering() == AtomicOrdering::Release ||
2671         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2672       Changed |= CC->insertRelease(MI, MOI.getScope(),
2673                                    MOI.getOrderingAddrSpace(),
2674                                    MOI.getIsCrossAddressSpaceOrdering(),
2675                                    Position::BEFORE);
2676 
2677     return Changed;
2678   }
2679 
2680   // Atomic instructions already bypass caches to the scope specified by the
2681   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2682   // need additional treatment.
2683   Changed |= CC->enableVolatileAndOrNonTemporal(
2684       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2685       MOI.isNonTemporal());
2686 
2687   // GFX12 specific: the scope (desired coherence domain in the cache
2688   // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2689   Changed |= CC->expandSystemScopeStore(MI);
2690   return Changed;
2691 }
2692 
2693 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2694                                           MachineBasicBlock::iterator &MI) {
2695   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2696 
2697   AtomicPseudoMIs.push_back(MI);
2698   bool Changed = false;
2699 
2700   // Refine fenced address space based on MMRAs.
2701   //
2702   // TODO: Should we support this MMRA on other atomic operations?
2703   auto OrderingAddrSpace =
2704       getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2705 
2706   if (MOI.isAtomic()) {
2707     const AtomicOrdering Order = MOI.getOrdering();
2708     if (Order == AtomicOrdering::Acquire) {
2709       Changed |= CC->insertWait(
2710           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2711           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2712     }
2713 
2714     if (Order == AtomicOrdering::Release ||
2715         Order == AtomicOrdering::AcquireRelease ||
2716         Order == AtomicOrdering::SequentiallyConsistent)
2717       /// TODO: This relies on a barrier always generating a waitcnt
2718       /// for LDS, ensuring it is not reordered with the completion of
2719       /// the preceding LDS operations. If the barrier had a memory
2720       /// ordering and memory scope, the library would not need to
2721       /// generate a fence. Support for barriers could be added in this
2722       /// file; SIInsertWaitcnts.cpp could then stop unconditionally
2723       /// adding an S_WAITCNT before an S_BARRIER.
2724       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2725                                    MOI.getIsCrossAddressSpaceOrdering(),
2726                                    Position::BEFORE);
2727 
2728     // TODO: If both a release and an invalidate are happening, they could be
2729     // combined into a single "BUFFER_WBINV*" instruction. This could be done
2730     // by reorganizing this code, or as part of optimizing the SIInsertWaitcnts
2731     // pass to track cache invalidate and write-back instructions.
2732 
2733     if (Order == AtomicOrdering::Acquire ||
2734         Order == AtomicOrdering::AcquireRelease ||
2735         Order == AtomicOrdering::SequentiallyConsistent)
2736       Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2737                                    Position::BEFORE);
2738 
2739     return Changed;
2740   }
2741 
2742   return Changed;
2743 }
2744 
2745 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
2746     const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
2747   assert(MI->mayLoad() && MI->mayStore());
2748 
2749   bool Changed = false;
2750 
2751   if (MOI.isAtomic()) {
2752     const AtomicOrdering Order = MOI.getOrdering();
2753     if (Order == AtomicOrdering::Monotonic ||
2754         Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2755         Order == AtomicOrdering::AcquireRelease ||
2756         Order == AtomicOrdering::SequentiallyConsistent) {
2757       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2758                                           MOI.getInstrAddrSpace());
2759     }
2760 
2761     if (Order == AtomicOrdering::Release ||
2762         Order == AtomicOrdering::AcquireRelease ||
2763         Order == AtomicOrdering::SequentiallyConsistent ||
2764         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2765       Changed |= CC->insertRelease(MI, MOI.getScope(),
2766                                    MOI.getOrderingAddrSpace(),
2767                                    MOI.getIsCrossAddressSpaceOrdering(),
2768                                    Position::BEFORE);
2769 
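         // Acquire orderings (including the cmpxchg failure ordering): wait for the
         // atomic itself to complete (it counts as a load if it returns a value, as
         // a store otherwise), then invalidate caches so later loads observe any
         // remote writes.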
2770     if (Order == AtomicOrdering::Acquire ||
2771         Order == AtomicOrdering::AcquireRelease ||
2772         Order == AtomicOrdering::SequentiallyConsistent ||
2773         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2774         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2775       Changed |= CC->insertWait(
2776           MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2777           isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2778           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2779       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2780                                    MOI.getOrderingAddrSpace(),
2781                                    Position::AFTER);
2782     }
2783 
2784     return Changed;
2785   }
2786 
2787   return Changed;
2788 }
2789 
2790 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2791   bool Changed = false;
2792 
2793   const MachineModuleInfo &MMI =
2794       getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2795 
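       // MOA classifies each memory instruction's ordering, scope and address
       // spaces; CC supplies the subtarget-specific cache-control and wait
       // insertion used to implement them.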
2796   SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
2797   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2798 
2799   for (auto &MBB : MF) {
2800     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2801 
2802       // Unbundle instructions after the post-RA scheduler.
2803       if (MI->isBundle() && MI->mayLoadOrStore()) {
2804         MachineBasicBlock::instr_iterator II(MI->getIterator());
2805         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2806              I != E && I->isBundledWithPred(); ++I) {
2807           I->unbundleFromPred();
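               // The internal-read flag marks reads of values defined in the
               // same bundle; it no longer applies once the bundle is gone.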
2808           for (MachineOperand &MO : I->operands())
2809             if (MO.isReg())
2810               MO.setIsInternalRead(false);
2811         }
2812 
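             // Erase the now-empty BUNDLE header and continue processing from
             // the first instruction that was unbundled.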
2813         MI->eraseFromParent();
2814         MI = II->getIterator();
2815       }
2816 
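           // Only instructions marked maybeAtomic can carry memory-model
           // semantics that this pass has to handle; skip everything else.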
2817       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2818         continue;
2819 
2820       if (const auto &MOI = MOA.getLoadInfo(MI))
2821         Changed |= expandLoad(*MOI, MI);
2822       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2823         Changed |= expandStore(*MOI, MI);
2824         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2825       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2826         Changed |= expandAtomicFence(*MOI, MI);
2827       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2828         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2829     }
2830   }
2831 
2832   Changed |= removeAtomicPseudoMIs();
2833   return Changed;
2834 }
2835 
2836 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2837 
2838 char SIMemoryLegalizer::ID = 0;
2839 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2840 
2841 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2842   return new SIMemoryLegalizer();
2843 }
2844