xref: /llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision 1fd1f4c0e1f229a2eceb94fc4e41bdd4b9ca7d5a)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/TargetParser/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35     cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43   NONE = 0u,
44   LOAD = 1u << 0,
45   STORE = 1u << 1,
46   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
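// For example, the bitwise operators enabled by LLVM_MARK_AS_BITMASK_ENUM allow
// the flags to be combined and queried:
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   bool OrdersLoads = (Op & SIMemOp::LOAD) != SIMemOp::NONE; // true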
48 
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52   BEFORE,
53   AFTER
54 };
55 
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58   NONE,
59   SINGLETHREAD,
60   WAVEFRONT,
61   WORKGROUP,
62   AGENT,
63   SYSTEM
64 };
65 
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operations. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69   NONE = 0u,
70   GLOBAL = 1u << 0,
71   LDS = 1u << 1,
72   SCRATCH = 1u << 2,
73   GDS = 1u << 3,
74   OTHER = 1u << 4,
75 
76   /// The address spaces that can be accessed by a FLAT instruction.
77   FLAT = GLOBAL | LDS | SCRATCH,
78 
79   /// The address spaces that support atomic instructions.
80   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82   /// All address spaces.
83   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
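// For example, a FLAT access may touch global, LDS, or scratch memory, so it is
// tagged with all three bits:
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::FLAT;
//   bool MayAccessLDS = (AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE; // true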
87 
88 class SIMemOpInfo final {
89 private:
90 
91   friend class SIMemOpAccess;
92 
93   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98   bool IsCrossAddressSpaceOrdering = false;
99   bool IsVolatile = false;
100   bool IsNonTemporal = false;
101   bool IsLastUse = false;
102 
103   SIMemOpInfo(
104       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
105       SIAtomicScope Scope = SIAtomicScope::SYSTEM,
106       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
107       SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
108       bool IsCrossAddressSpaceOrdering = true,
109       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
110       bool IsVolatile = false, bool IsNonTemporal = false,
111       bool IsLastUse = false)
112       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
113         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
114         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
116         IsLastUse(IsLastUse) {
117 
118     if (Ordering == AtomicOrdering::NotAtomic) {
119       assert(Scope == SIAtomicScope::NONE &&
120              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121              !IsCrossAddressSpaceOrdering &&
122              FailureOrdering == AtomicOrdering::NotAtomic);
123       return;
124     }
125 
126     assert(Scope != SIAtomicScope::NONE &&
127            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128                SIAtomicAddrSpace::NONE &&
129            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE);
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
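  // Worked example of the narrowing above (sketch): for an LDS-only atomic
  // requested at agent scope, InstrAddrSpace == OrderingAddrSpace == LDS, so
  // IsCrossAddressSpaceOrdering is cleared (a single address space), and the
  // scope is clamped from AGENT down to WORKGROUP since LDS is only shared
  // within a work-group.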
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
174   /// \returns The address spaces accessed by the machine
175   /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
186   /// \returns True iff memory ordering of operations on
187   /// different address spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if memory access of the machine instruction used to
205   /// create this SIMemOpInfo is a last use, false otherwise.
206   bool isLastUse() const { return IsLastUse; }
207 
208   /// \returns True if ordering constraint of the machine instruction used to
209   /// create this SIMemOpInfo is unordered or higher, false otherwise.
210   bool isAtomic() const {
211     return Ordering != AtomicOrdering::NotAtomic;
212   }
213 
214 };
215 
216 class SIMemOpAccess final {
217 private:
218   AMDGPUMachineModuleInfo *MMI = nullptr;
219 
220   /// Reports unsupported message \p Msg for \p MI to LLVM context.
221   void reportUnsupported(const MachineBasicBlock::iterator &MI,
222                          const char *Msg) const;
223 
224   /// Inspects the target synchronization scope \p SSID and determines
225   /// the SI atomic scope it corresponds to, the address spaces it
226   /// covers, and whether the memory ordering applies between address
227   /// spaces.
228   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
229   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
230 
231   /// \returns A bit set of the address spaces corresponding to \p AS.
232   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
233 
234   /// \returns Info constructed from \p MI, which has at least one machine
235   /// memory operand.
236   std::optional<SIMemOpInfo>
237   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
238 
239 public:
240   /// Construct class to support accessing the machine memory operands
241   /// of instructions in the machine function \p MF.
242   SIMemOpAccess(MachineFunction &MF);
243 
244   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
245   std::optional<SIMemOpInfo>
246   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
247 
248   /// \returns Store info if \p MI is a store operation, "std::nullopt"
249   /// otherwise.
250   std::optional<SIMemOpInfo>
251   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
252 
253   /// \returns Atomic fence info if \p MI is an atomic fence operation,
254   /// "std::nullopt" otherwise.
255   std::optional<SIMemOpInfo>
256   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
257 
258   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
259   /// rmw operation, "std::nullopt" otherwise.
260   std::optional<SIMemOpInfo>
261   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
262 };
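// Minimal usage sketch (mirroring how the legalizer drives this class later in
// this file):
//   SIMemOpAccess MOA(MF);
//   if (const auto MOI = MOA.getLoadInfo(MI))
//     Changed |= expandLoad(*MOI, MI);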
263 
264 class SICacheControl {
265 protected:
266 
267   /// AMDGPU subtarget info.
268   const GCNSubtarget &ST;
269 
270   /// Instruction info.
271   const SIInstrInfo *TII = nullptr;
272 
273   IsaVersion IV;
274 
275   /// Whether to insert cache invalidating instructions.
276   bool InsertCacheInv;
277 
278   SICacheControl(const GCNSubtarget &ST);
279 
280   /// Sets named bit \p Bit to "true" if present in instruction \p MI.
281   /// \returns True if \p MI is modified, false otherwise.
282   bool enableNamedBit(const MachineBasicBlock::iterator MI,
283                       AMDGPU::CPol::CPol Bit) const;
284 
285 public:
286 
287   /// Create a cache control for the subtarget \p ST.
288   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
289 
290   /// Update \p MI memory load instruction to bypass any caches up to
291   /// the \p Scope memory scope for address spaces \p AddrSpace.
292   /// Return true iff the instruction was modified.
293   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
294                                      SIAtomicScope Scope,
295                                      SIAtomicAddrSpace AddrSpace) const = 0;
296 
297   /// Update \p MI memory store instruction to bypass any caches up to
298   /// the \p Scope memory scope for address spaces \p AddrSpace.
299   /// Return true iff the instruction was modified.
300   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
301                                       SIAtomicScope Scope,
302                                       SIAtomicAddrSpace AddrSpace) const = 0;
303 
304   /// Update \p MI memory read-modify-write instruction to bypass any caches up
305   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
306   /// iff the instruction was modified.
307   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
308                                     SIAtomicScope Scope,
309                                     SIAtomicAddrSpace AddrSpace) const = 0;
310 
311   /// Update \p MI memory instruction of kind \p Op associated with address
312   /// spaces \p AddrSpace to indicate it is volatile and/or
313   /// nontemporal/last-use. Return true iff the instruction was modified.
314   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
315                                               SIAtomicAddrSpace AddrSpace,
316                                               SIMemOp Op, bool IsVolatile,
317                                               bool IsNonTemporal,
318                                               bool IsLastUse = false) const = 0;
319 
320   virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
321     return false;
322   };
323 
324   /// Inserts any necessary instructions at position \p Pos relative
325   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
326   /// \p Op associated with address spaces \p AddrSpace have completed. Used
327   /// between memory instructions to enforce the order they become visible as
328   /// observed by other memory instructions executing in memory scope \p Scope.
329   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
330   /// address spaces. Returns true iff any instructions inserted.
331   virtual bool insertWait(MachineBasicBlock::iterator &MI,
332                           SIAtomicScope Scope,
333                           SIAtomicAddrSpace AddrSpace,
334                           SIMemOp Op,
335                           bool IsCrossAddrSpaceOrdering,
336                           Position Pos) const = 0;
337 
338   /// Inserts any necessary instructions at position \p Pos relative to
339   /// instruction \p MI to ensure any subsequent memory instructions of this
340   /// thread with address spaces \p AddrSpace will observe the previous memory
341   /// operations by any thread for memory scopes up to memory scope \p Scope .
342   /// Returns true iff any instructions inserted.
343   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
344                              SIAtomicScope Scope,
345                              SIAtomicAddrSpace AddrSpace,
346                              Position Pos) const = 0;
347 
348   /// Inserts any necessary instructions at position \p Pos relative to
349   /// instruction \p MI to ensure previous memory instructions by this thread
350   /// with address spaces \p AddrSpace have completed and can be observed by
351   /// subsequent memory instructions by any thread executing in memory scope \p
352   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
353   /// between address spaces. Returns true iff any instructions are inserted.
354   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
355                              SIAtomicScope Scope,
356                              SIAtomicAddrSpace AddrSpace,
357                              bool IsCrossAddrSpaceOrdering,
358                              Position Pos) const = 0;
359 
360   /// Virtual destructor to allow derivations to be deleted.
361   virtual ~SICacheControl() = default;
362 
363   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
364                                    MachineBasicBlock::iterator &MI) const {
365     return false;
366   }
367 };
368 
369 class SIGfx6CacheControl : public SICacheControl {
370 protected:
371 
372   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
373   /// is modified, false otherwise.
374   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
375     return enableNamedBit(MI, AMDGPU::CPol::GLC);
376   }
377 
378   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
379   /// is modified, false otherwise.
380   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
381     return enableNamedBit(MI, AMDGPU::CPol::SLC);
382   }
383 
384 public:
385 
386   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
387 
388   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
389                              SIAtomicScope Scope,
390                              SIAtomicAddrSpace AddrSpace) const override;
391 
392   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
393                               SIAtomicScope Scope,
394                               SIAtomicAddrSpace AddrSpace) const override;
395 
396   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
397                             SIAtomicScope Scope,
398                             SIAtomicAddrSpace AddrSpace) const override;
399 
400   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
401                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
402                                       bool IsVolatile, bool IsNonTemporal,
403                                       bool IsLastUse) const override;
404 
405   bool insertWait(MachineBasicBlock::iterator &MI,
406                   SIAtomicScope Scope,
407                   SIAtomicAddrSpace AddrSpace,
408                   SIMemOp Op,
409                   bool IsCrossAddrSpaceOrdering,
410                   Position Pos) const override;
411 
412   bool insertAcquire(MachineBasicBlock::iterator &MI,
413                      SIAtomicScope Scope,
414                      SIAtomicAddrSpace AddrSpace,
415                      Position Pos) const override;
416 
417   bool insertRelease(MachineBasicBlock::iterator &MI,
418                      SIAtomicScope Scope,
419                      SIAtomicAddrSpace AddrSpace,
420                      bool IsCrossAddrSpaceOrdering,
421                      Position Pos) const override;
422 };
423 
424 class SIGfx7CacheControl : public SIGfx6CacheControl {
425 public:
426 
427   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
428 
429   bool insertAcquire(MachineBasicBlock::iterator &MI,
430                      SIAtomicScope Scope,
431                      SIAtomicAddrSpace AddrSpace,
432                      Position Pos) const override;
433 
434 };
435 
436 class SIGfx90ACacheControl : public SIGfx7CacheControl {
437 public:
438 
439   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
440 
441   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
442                              SIAtomicScope Scope,
443                              SIAtomicAddrSpace AddrSpace) const override;
444 
445   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
446                               SIAtomicScope Scope,
447                               SIAtomicAddrSpace AddrSpace) const override;
448 
449   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
450                             SIAtomicScope Scope,
451                             SIAtomicAddrSpace AddrSpace) const override;
452 
453   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
454                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
455                                       bool IsVolatile, bool IsNonTemporal,
456                                       bool IsLastUse) const override;
457 
458   bool insertWait(MachineBasicBlock::iterator &MI,
459                   SIAtomicScope Scope,
460                   SIAtomicAddrSpace AddrSpace,
461                   SIMemOp Op,
462                   bool IsCrossAddrSpaceOrdering,
463                   Position Pos) const override;
464 
465   bool insertAcquire(MachineBasicBlock::iterator &MI,
466                      SIAtomicScope Scope,
467                      SIAtomicAddrSpace AddrSpace,
468                      Position Pos) const override;
469 
470   bool insertRelease(MachineBasicBlock::iterator &MI,
471                      SIAtomicScope Scope,
472                      SIAtomicAddrSpace AddrSpace,
473                      bool IsCrossAddrSpaceOrdering,
474                      Position Pos) const override;
475 };
476 
477 class SIGfx940CacheControl : public SIGfx90ACacheControl {
478 protected:
479 
480   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
481   /// is modified, false otherwise.
482   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
483     return enableNamedBit(MI, AMDGPU::CPol::SC0);
484   }
485 
486   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
487   /// is modified, false otherwise.
488   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
489     return enableNamedBit(MI, AMDGPU::CPol::SC1);
490   }
491 
492   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
493   /// is modified, false otherwise.
494   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
495     return enableNamedBit(MI, AMDGPU::CPol::NT);
496   }
497 
498 public:
499 
500   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
501 
502   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
503                              SIAtomicScope Scope,
504                              SIAtomicAddrSpace AddrSpace) const override;
505 
506   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
507                               SIAtomicScope Scope,
508                               SIAtomicAddrSpace AddrSpace) const override;
509 
510   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
511                             SIAtomicScope Scope,
512                             SIAtomicAddrSpace AddrSpace) const override;
513 
514   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
515                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
516                                       bool IsVolatile, bool IsNonTemporal,
517                                       bool IsLastUse) const override;
518 
519   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
520                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
521 
522   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
523                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
524                      Position Pos) const override;
525 
526   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
527                            MachineBasicBlock::iterator &MI) const override {
528     bool Changed = false;
529     if (ST.hasForceStoreSC0SC1() &&
530         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
531                                     SIAtomicAddrSpace::GLOBAL |
532                                     SIAtomicAddrSpace::OTHER)) !=
533          SIAtomicAddrSpace::NONE) {
534       Changed |= enableSC0Bit(MI);
535       Changed |= enableSC1Bit(MI);
536     }
537     return Changed;
538   }
539 };
540 
541 class SIGfx10CacheControl : public SIGfx7CacheControl {
542 protected:
543 
544   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
545   /// is modified, false otherwise.
546   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
547     return enableNamedBit(MI, AMDGPU::CPol::DLC);
548   }
549 
550 public:
551 
552   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
553 
554   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
555                              SIAtomicScope Scope,
556                              SIAtomicAddrSpace AddrSpace) const override;
557 
558   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
559                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
560                                       bool IsVolatile, bool IsNonTemporal,
561                                       bool IsLastUse) const override;
562 
563   bool insertWait(MachineBasicBlock::iterator &MI,
564                   SIAtomicScope Scope,
565                   SIAtomicAddrSpace AddrSpace,
566                   SIMemOp Op,
567                   bool IsCrossAddrSpaceOrdering,
568                   Position Pos) const override;
569 
570   bool insertAcquire(MachineBasicBlock::iterator &MI,
571                      SIAtomicScope Scope,
572                      SIAtomicAddrSpace AddrSpace,
573                      Position Pos) const override;
574 };
575 
576 class SIGfx11CacheControl : public SIGfx10CacheControl {
577 public:
578   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
579 
580   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
581                              SIAtomicScope Scope,
582                              SIAtomicAddrSpace AddrSpace) const override;
583 
584   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
585                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
586                                       bool IsVolatile, bool IsNonTemporal,
587                                       bool IsLastUse) const override;
588 };
589 
590 class SIGfx12CacheControl : public SIGfx11CacheControl {
591 protected:
592   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
593   // \returns True if \p MI is modified, false otherwise.
594   bool setTH(const MachineBasicBlock::iterator MI,
595              AMDGPU::CPol::CPol Value) const;
596   // Sets Scope policy to \p Value if CPol operand is present in instruction \p
597   // MI. \returns True if \p MI is modified, false otherwise.
598   bool setScope(const MachineBasicBlock::iterator MI,
599                 AMDGPU::CPol::CPol Value) const;
600 
601   // Stores with system scope (SCOPE_SYS) need to wait for:
602   // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
603   // - non-returning-atomics       - wait for STORECNT==0
604   //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
605   //   since it does not distinguish atomics-with-return from regular stores.
606   // There is no need to wait if memory is cached (mtype != UC).
607   bool
608   insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
609 
610 public:
611   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
612 
613   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
614                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
615                   bool IsCrossAddrSpaceOrdering, Position Pos) const override;
616 
617   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
618                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
619 
620   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
621                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
622                                       bool IsVolatile, bool IsNonTemporal,
623                                       bool IsLastUse) const override;
624 
625   bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
626 };
627 
628 class SIMemoryLegalizer final : public MachineFunctionPass {
629 private:
630 
631   /// Cache Control.
632   std::unique_ptr<SICacheControl> CC = nullptr;
633 
634   /// List of atomic pseudo instructions.
635   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
636 
637   /// Return true iff instruction \p MI is an atomic instruction that
638   /// returns a result.
639   bool isAtomicRet(const MachineInstr &MI) const {
640     return SIInstrInfo::isAtomicRet(MI);
641   }
642 
643   /// Removes all processed atomic pseudo instructions from the current
644   /// function. Returns true if current function is modified, false otherwise.
645   bool removeAtomicPseudoMIs();
646 
647   /// Expands load operation \p MI. Returns true if instructions are
648   /// added/deleted or \p MI is modified, false otherwise.
649   bool expandLoad(const SIMemOpInfo &MOI,
650                   MachineBasicBlock::iterator &MI);
651   /// Expands store operation \p MI. Returns true if instructions are
652   /// added/deleted or \p MI is modified, false otherwise.
653   bool expandStore(const SIMemOpInfo &MOI,
654                    MachineBasicBlock::iterator &MI);
655   /// Expands atomic fence operation \p MI. Returns true if
656   /// instructions are added/deleted or \p MI is modified, false otherwise.
657   bool expandAtomicFence(const SIMemOpInfo &MOI,
658                          MachineBasicBlock::iterator &MI);
659   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
660   /// instructions are added/deleted or \p MI is modified, false otherwise.
661   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
662                                 MachineBasicBlock::iterator &MI);
663 
664 public:
665   static char ID;
666 
667   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
668 
669   void getAnalysisUsage(AnalysisUsage &AU) const override {
670     AU.setPreservesCFG();
671     MachineFunctionPass::getAnalysisUsage(AU);
672   }
673 
674   StringRef getPassName() const override {
675     return PASS_NAME;
676   }
677 
678   bool runOnMachineFunction(MachineFunction &MF) override;
679 };
680 
681 } // end anonymous namespace
682 
683 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
684                                       const char *Msg) const {
685   const Function &Func = MI->getParent()->getParent()->getFunction();
686   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
687   Func.getContext().diagnose(Diag);
688 }
689 
690 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
691 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
692                                SIAtomicAddrSpace InstrAddrSpace) const {
693   if (SSID == SyncScope::System)
694     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
695   if (SSID == MMI->getAgentSSID())
696     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
697   if (SSID == MMI->getWorkgroupSSID())
698     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
699                       true);
700   if (SSID == MMI->getWavefrontSSID())
701     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
702                       true);
703   if (SSID == SyncScope::SingleThread)
704     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
705                       true);
706   if (SSID == MMI->getSystemOneAddressSpaceSSID())
707     return std::tuple(SIAtomicScope::SYSTEM,
708                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
709   if (SSID == MMI->getAgentOneAddressSpaceSSID())
710     return std::tuple(SIAtomicScope::AGENT,
711                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
712   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
713     return std::tuple(SIAtomicScope::WORKGROUP,
714                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
715   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
716     return std::tuple(SIAtomicScope::WAVEFRONT,
717                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
718   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
719     return std::tuple(SIAtomicScope::SINGLETHREAD,
720                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
721   return std::nullopt;
722 }
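// For example, MMI->getAgentSSID() yields (AGENT, ATOMIC, true): all atomic
// address spaces are ordered, including across address spaces. The
// one-address-space variant MMI->getAgentOneAddressSpaceSSID() instead limits
// ordering to the address spaces the instruction itself accesses and reports no
// cross-address-space ordering.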
723 
724 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
725   if (AS == AMDGPUAS::FLAT_ADDRESS)
726     return SIAtomicAddrSpace::FLAT;
727   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
728     return SIAtomicAddrSpace::GLOBAL;
729   if (AS == AMDGPUAS::LOCAL_ADDRESS)
730     return SIAtomicAddrSpace::LDS;
731   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
732     return SIAtomicAddrSpace::SCRATCH;
733   if (AS == AMDGPUAS::REGION_ADDRESS)
734     return SIAtomicAddrSpace::GDS;
735 
736   return SIAtomicAddrSpace::OTHER;
737 }
738 
739 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
740   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
741 }
742 
743 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
744     const MachineBasicBlock::iterator &MI) const {
745   assert(MI->getNumMemOperands() > 0);
746 
747   SyncScope::ID SSID = SyncScope::SingleThread;
748   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
749   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
750   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
751   bool IsNonTemporal = true;
752   bool IsVolatile = false;
753   bool IsLastUse = false;
754 
755   // Validator should check whether or not MMOs cover the entire set of
756   // locations accessed by the memory instruction.
757   for (const auto &MMO : MI->memoperands()) {
758     IsNonTemporal &= MMO->isNonTemporal();
759     IsVolatile |= MMO->isVolatile();
760     IsLastUse |= MMO->getFlags() & MOLastUse;
761     InstrAddrSpace |=
762       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
763     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
764     if (OpOrdering != AtomicOrdering::NotAtomic) {
765       const auto &IsSyncScopeInclusion =
766           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
767       if (!IsSyncScopeInclusion) {
768         reportUnsupported(MI,
769           "Unsupported non-inclusive atomic synchronization scope");
770         return std::nullopt;
771       }
772 
773       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
774       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
775       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
776              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
777       FailureOrdering =
778           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
779     }
780   }
781 
782   SIAtomicScope Scope = SIAtomicScope::NONE;
783   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
784   bool IsCrossAddressSpaceOrdering = false;
785   if (Ordering != AtomicOrdering::NotAtomic) {
786     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
787     if (!ScopeOrNone) {
788       reportUnsupported(MI, "Unsupported atomic synchronization scope");
789       return std::nullopt;
790     }
791     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
792         *ScopeOrNone;
793     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
794         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
795         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
796       reportUnsupported(MI, "Unsupported atomic address space");
797       return std::nullopt;
798     }
799   }
800   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
801                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
802                      IsNonTemporal, IsLastUse);
803 }
804 
805 std::optional<SIMemOpInfo>
806 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
807   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
808 
809   if (!(MI->mayLoad() && !MI->mayStore()))
810     return std::nullopt;
811 
812   // Be conservative if there are no memory operands.
813   if (MI->getNumMemOperands() == 0)
814     return SIMemOpInfo();
815 
816   return constructFromMIWithMMO(MI);
817 }
818 
819 std::optional<SIMemOpInfo>
820 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
821   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
822 
823   if (!(!MI->mayLoad() && MI->mayStore()))
824     return std::nullopt;
825 
826   // Be conservative if there are no memory operands.
827   if (MI->getNumMemOperands() == 0)
828     return SIMemOpInfo();
829 
830   return constructFromMIWithMMO(MI);
831 }
832 
833 std::optional<SIMemOpInfo>
834 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
835   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
836 
837   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
838     return std::nullopt;
839 
840   AtomicOrdering Ordering =
841     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
842 
843   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
844   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
845   if (!ScopeOrNone) {
846     reportUnsupported(MI, "Unsupported atomic synchronization scope");
847     return std::nullopt;
848   }
849 
850   SIAtomicScope Scope = SIAtomicScope::NONE;
851   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
852   bool IsCrossAddressSpaceOrdering = false;
853   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
854       *ScopeOrNone;
855 
856   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
857       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
858     reportUnsupported(MI, "Unsupported atomic address space");
859     return std::nullopt;
860   }
861 
862   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
863                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
864 }
865 
866 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
867     const MachineBasicBlock::iterator &MI) const {
868   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
869 
870   if (!(MI->mayLoad() && MI->mayStore()))
871     return std::nullopt;
872 
873   // Be conservative if there are no memory operands.
874   if (MI->getNumMemOperands() == 0)
875     return SIMemOpInfo();
876 
877   return constructFromMIWithMMO(MI);
878 }
879 
880 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
881   TII = ST.getInstrInfo();
882   IV = getIsaVersion(ST.getCPU());
883   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
884 }
885 
886 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
887                                     AMDGPU::CPol::CPol Bit) const {
888   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
889   if (!CPol)
890     return false;
891 
892   CPol->setImm(CPol->getImm() | Bit);
893   return true;
894 }
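// For example, calling this with AMDGPU::CPol::GLC and then AMDGPU::CPol::SLC
// leaves both bits set in the cpol immediate; instructions without a cpol
// operand are left unchanged and false is returned.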
895 
896 /* static */
897 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
898   GCNSubtarget::Generation Generation = ST.getGeneration();
899   if (ST.hasGFX940Insts())
900     return std::make_unique<SIGfx940CacheControl>(ST);
901   if (ST.hasGFX90AInsts())
902     return std::make_unique<SIGfx90ACacheControl>(ST);
903   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
904     return std::make_unique<SIGfx6CacheControl>(ST);
905   if (Generation < AMDGPUSubtarget::GFX10)
906     return std::make_unique<SIGfx7CacheControl>(ST);
907   if (Generation < AMDGPUSubtarget::GFX11)
908     return std::make_unique<SIGfx10CacheControl>(ST);
909   if (Generation < AMDGPUSubtarget::GFX12)
910     return std::make_unique<SIGfx11CacheControl>(ST);
911   return std::make_unique<SIGfx12CacheControl>(ST);
912 }
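// Note the ordering above: the GFX940 and GFX90A checks come before the
// generation checks, so, for instance, a gfx90a subtarget (generation GFX9)
// receives SIGfx90ACacheControl rather than SIGfx7CacheControl.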
913 
914 bool SIGfx6CacheControl::enableLoadCacheBypass(
915     const MachineBasicBlock::iterator &MI,
916     SIAtomicScope Scope,
917     SIAtomicAddrSpace AddrSpace) const {
918   assert(MI->mayLoad() && !MI->mayStore());
919   bool Changed = false;
920 
921   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
922     switch (Scope) {
923     case SIAtomicScope::SYSTEM:
924     case SIAtomicScope::AGENT:
925       // Set L1 cache policy to MISS_EVICT.
926       // Note: there is no L2 cache bypass policy at the ISA level.
927       Changed |= enableGLCBit(MI);
928       break;
929     case SIAtomicScope::WORKGROUP:
930     case SIAtomicScope::WAVEFRONT:
931     case SIAtomicScope::SINGLETHREAD:
932       // No cache to bypass.
933       break;
934     default:
935       llvm_unreachable("Unsupported synchronization scope");
936     }
937   }
938 
939   /// The scratch address space does not need the global memory caches
940   /// to be bypassed as all memory operations by the same thread are
941   /// sequentially consistent, and no other thread can access scratch
942   /// memory.
943 
944   /// Other address spaces do not have a cache.
945 
946   return Changed;
947 }
948 
949 bool SIGfx6CacheControl::enableStoreCacheBypass(
950     const MachineBasicBlock::iterator &MI,
951     SIAtomicScope Scope,
952     SIAtomicAddrSpace AddrSpace) const {
953   assert(!MI->mayLoad() && MI->mayStore());
954   bool Changed = false;
955 
956   /// The L1 cache is write-through so does not need to be bypassed. There is no
957   /// bypass control for the L2 cache at the ISA level.
958 
959   return Changed;
960 }
961 
962 bool SIGfx6CacheControl::enableRMWCacheBypass(
963     const MachineBasicBlock::iterator &MI,
964     SIAtomicScope Scope,
965     SIAtomicAddrSpace AddrSpace) const {
966   assert(MI->mayLoad() && MI->mayStore());
967   bool Changed = false;
968 
969   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
970   /// bypassed, and the GLC bit is instead used to indicate if they are
971   /// return or no-return.
972   /// Note: there is no L2 cache coherent bypass control at the ISA level.
973 
974   return Changed;
975 }
976 
977 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
978     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
979     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
980   // Only handle load and store, not atomic read-modify-write instructions. The
981   // latter use glc to indicate if the atomic returns a result and so must not
982   // be used for cache control.
983   assert(MI->mayLoad() ^ MI->mayStore());
984 
985   // Only update load and store, not LLVM IR atomic read-modify-write
986   // instructions. The latter are always marked as volatile, so they cannot
987   // sensibly be handled here without pessimizing all atomics. They also do not
988   // support the nontemporal attribute.
989   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
990 
991   bool Changed = false;
992 
993   if (IsVolatile) {
994     // Set L1 cache policy to be MISS_EVICT for load instructions
995     // and MISS_LRU for store instructions.
996     // Note: there is no L2 cache bypass policy at the ISA level.
997     if (Op == SIMemOp::LOAD)
998       Changed |= enableGLCBit(MI);
999 
1000     // Ensure operation has completed at system scope to cause all volatile
1001     // operations to be visible outside the program in a global order. Do not
1002     // request cross address space as only the global address space can be
1003     // observable outside the program, so no need to cause a waitcnt for LDS
1004     // address space operations.
1005     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1006                           Position::AFTER);
1007 
1008     return Changed;
1009   }
1010 
1011   if (IsNonTemporal) {
1012     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1013     // for both loads and stores, and the L2 cache policy to STREAM.
1014     Changed |= enableGLCBit(MI);
1015     Changed |= enableSLCBit(MI);
1016     return Changed;
1017   }
1018 
1019   return Changed;
1020 }
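// For example, a volatile global load on these targets gets glc set (L1
// MISS_EVICT) plus, from the insertWait call above, roughly an
// "S_WAITCNT vmcnt(0)" after it; a nontemporal access instead gets glc and slc
// with no extra wait.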
1021 
1022 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1023                                     SIAtomicScope Scope,
1024                                     SIAtomicAddrSpace AddrSpace,
1025                                     SIMemOp Op,
1026                                     bool IsCrossAddrSpaceOrdering,
1027                                     Position Pos) const {
1028   bool Changed = false;
1029 
1030   MachineBasicBlock &MBB = *MI->getParent();
1031   DebugLoc DL = MI->getDebugLoc();
1032 
1033   if (Pos == Position::AFTER)
1034     ++MI;
1035 
1036   bool VMCnt = false;
1037   bool LGKMCnt = false;
1038 
1039   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1040       SIAtomicAddrSpace::NONE) {
1041     switch (Scope) {
1042     case SIAtomicScope::SYSTEM:
1043     case SIAtomicScope::AGENT:
1044       VMCnt |= true;
1045       break;
1046     case SIAtomicScope::WORKGROUP:
1047     case SIAtomicScope::WAVEFRONT:
1048     case SIAtomicScope::SINGLETHREAD:
1049       // The L1 cache keeps all memory operations in order for
1050       // wavefronts in the same work-group.
1051       break;
1052     default:
1053       llvm_unreachable("Unsupported synchronization scope");
1054     }
1055   }
1056 
1057   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1058     switch (Scope) {
1059     case SIAtomicScope::SYSTEM:
1060     case SIAtomicScope::AGENT:
1061     case SIAtomicScope::WORKGROUP:
1062       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1063       // not needed as LDS operations for all waves are executed in a total
1064       // global ordering as observed by all waves. Required if also
1065       // synchronizing with global/GDS memory as LDS operations could be
1066       // reordered with respect to later global/GDS memory operations of the
1067       // same wave.
1068       LGKMCnt |= IsCrossAddrSpaceOrdering;
1069       break;
1070     case SIAtomicScope::WAVEFRONT:
1071     case SIAtomicScope::SINGLETHREAD:
1072       // The LDS keeps all memory operations in order for
1073       // the same wavefront.
1074       break;
1075     default:
1076       llvm_unreachable("Unsupported synchronization scope");
1077     }
1078   }
1079 
1080   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1081     switch (Scope) {
1082     case SIAtomicScope::SYSTEM:
1083     case SIAtomicScope::AGENT:
1084       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1085       // is not needed as GDS operations for all waves are executed in a total
1086       // global ordering as observed by all waves. Required if also
1087       // synchronizing with global/LDS memory as GDS operations could be
1088       // reordered with respect to later global/LDS memory operations of the
1089       // same wave.
1090       LGKMCnt |= IsCrossAddrSpaceOrdering;
1091       break;
1092     case SIAtomicScope::WORKGROUP:
1093     case SIAtomicScope::WAVEFRONT:
1094     case SIAtomicScope::SINGLETHREAD:
1095       // The GDS keeps all memory operations in order for
1096       // the same work-group.
1097       break;
1098     default:
1099       llvm_unreachable("Unsupported synchronization scope");
1100     }
1101   }
1102 
1103   if (VMCnt || LGKMCnt) {
1104     unsigned WaitCntImmediate =
1105       AMDGPU::encodeWaitcnt(IV,
1106                             VMCnt ? 0 : getVmcntBitMask(IV),
1107                             getExpcntBitMask(IV),
1108                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1109     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1110         .addImm(WaitCntImmediate);
1111     Changed = true;
1112   }
1113 
1114   if (Pos == Position::AFTER)
1115     --MI;
1116 
1117   return Changed;
1118 }
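// Example (sketch): when called at agent scope for GLOBAL | LDS | GDS with
// cross-address-space ordering, both VMCnt and LGKMCnt end up set and a single
// soft S_WAITCNT waiting for vmcnt(0) and lgkmcnt(0) is emitted at the
// requested position.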
1119 
1120 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1121                                        SIAtomicScope Scope,
1122                                        SIAtomicAddrSpace AddrSpace,
1123                                        Position Pos) const {
1124   if (!InsertCacheInv)
1125     return false;
1126 
1127   bool Changed = false;
1128 
1129   MachineBasicBlock &MBB = *MI->getParent();
1130   DebugLoc DL = MI->getDebugLoc();
1131 
1132   if (Pos == Position::AFTER)
1133     ++MI;
1134 
1135   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1136     switch (Scope) {
1137     case SIAtomicScope::SYSTEM:
1138     case SIAtomicScope::AGENT:
1139       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1140       Changed = true;
1141       break;
1142     case SIAtomicScope::WORKGROUP:
1143     case SIAtomicScope::WAVEFRONT:
1144     case SIAtomicScope::SINGLETHREAD:
1145       // No cache to invalidate.
1146       break;
1147     default:
1148       llvm_unreachable("Unsupported synchronization scope");
1149     }
1150   }
1151 
1152   /// The scratch address space does not need the global memory cache
1153   /// to be flushed as all memory operations by the same thread are
1154   /// sequentially consistent, and no other thread can access scratch
1155   /// memory.
1156 
1157   /// Other address spaces do not have a cache.
1158 
1159   if (Pos == Position::AFTER)
1160     --MI;
1161 
1162   return Changed;
1163 }
1164 
1165 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1166                                        SIAtomicScope Scope,
1167                                        SIAtomicAddrSpace AddrSpace,
1168                                        bool IsCrossAddrSpaceOrdering,
1169                                        Position Pos) const {
1170   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1171                     IsCrossAddrSpaceOrdering, Pos);
1172 }
1173 
1174 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1175                                        SIAtomicScope Scope,
1176                                        SIAtomicAddrSpace AddrSpace,
1177                                        Position Pos) const {
1178   if (!InsertCacheInv)
1179     return false;
1180 
1181   bool Changed = false;
1182 
1183   MachineBasicBlock &MBB = *MI->getParent();
1184   DebugLoc DL = MI->getDebugLoc();
1185 
1186   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1187 
1188   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1189                                     ? AMDGPU::BUFFER_WBINVL1
1190                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1191 
1192   if (Pos == Position::AFTER)
1193     ++MI;
1194 
1195   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1196     switch (Scope) {
1197     case SIAtomicScope::SYSTEM:
1198     case SIAtomicScope::AGENT:
1199       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1200       Changed = true;
1201       break;
1202     case SIAtomicScope::WORKGROUP:
1203     case SIAtomicScope::WAVEFRONT:
1204     case SIAtomicScope::SINGLETHREAD:
1205       // No cache to invalidate.
1206       break;
1207     default:
1208       llvm_unreachable("Unsupported synchronization scope");
1209     }
1210   }
1211 
1212   /// The scratch address space does not need the global memory cache
1213   /// to be flushed as all memory operations by the same thread are
1214   /// sequentially consistent, and no other thread can access scratch
1215   /// memory.
1216 
1217   /// Other address spaces do not have a cache.
1218 
1219   if (Pos == Position::AFTER)
1220     --MI;
1221 
1222   return Changed;
1223 }
1224 
1225 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1226     const MachineBasicBlock::iterator &MI,
1227     SIAtomicScope Scope,
1228     SIAtomicAddrSpace AddrSpace) const {
1229   assert(MI->mayLoad() && !MI->mayStore());
1230   bool Changed = false;
1231 
1232   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1233     switch (Scope) {
1234     case SIAtomicScope::SYSTEM:
1235     case SIAtomicScope::AGENT:
1236       // Set the L1 cache policy to MISS_LRU.
1237       // Note: there is no L2 cache bypass policy at the ISA level.
1238       Changed |= enableGLCBit(MI);
1239       break;
1240     case SIAtomicScope::WORKGROUP:
1241       // In threadgroup split mode the waves of a work-group can be executing on
1242       // different CUs, so the L1, which is per CU, needs to be bypassed.
1243       // Otherwise in non-threadgroup split mode all waves of a work-group are
1244       // on the same CU, and so the L1 does not need to be bypassed.
1245       if (ST.isTgSplitEnabled())
1246         Changed |= enableGLCBit(MI);
1247       break;
1248     case SIAtomicScope::WAVEFRONT:
1249     case SIAtomicScope::SINGLETHREAD:
1250       // No cache to bypass.
1251       break;
1252     default:
1253       llvm_unreachable("Unsupported synchronization scope");
1254     }
1255   }
1256 
1257   /// The scratch address space does not need the global memory caches
1258   /// to be bypassed as all memory operations by the same thread are
1259   /// sequentially consistent, and no other thread can access scratch
1260   /// memory.
1261 
1262   /// Other address spaces do not have a cache.
1263 
1264   return Changed;
1265 }
1266 
1267 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1268     const MachineBasicBlock::iterator &MI,
1269     SIAtomicScope Scope,
1270     SIAtomicAddrSpace AddrSpace) const {
1271   assert(!MI->mayLoad() && MI->mayStore());
1272   bool Changed = false;
1273 
1274   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1275     switch (Scope) {
1276     case SIAtomicScope::SYSTEM:
1277     case SIAtomicScope::AGENT:
1278       /// Do not set glc for store atomic operations as they implicitly write
1279       /// through the L1 cache.
1280       break;
1281     case SIAtomicScope::WORKGROUP:
1282     case SIAtomicScope::WAVEFRONT:
1283     case SIAtomicScope::SINGLETHREAD:
1284       // No cache to bypass. Store atomics implicitly write through the L1
1285       // cache.
1286       break;
1287     default:
1288       llvm_unreachable("Unsupported synchronization scope");
1289     }
1290   }
1291 
1292   /// The scratch address space does not need the global memory caches
1293   /// to be bypassed as all memory operations by the same thread are
1294   /// sequentially consistent, and no other thread can access scratch
1295   /// memory.
1296 
1297   /// Other address spaces do not have a cache.
1298 
1299   return Changed;
1300 }
1301 
1302 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1303     const MachineBasicBlock::iterator &MI,
1304     SIAtomicScope Scope,
1305     SIAtomicAddrSpace AddrSpace) const {
1306   assert(MI->mayLoad() && MI->mayStore());
1307   bool Changed = false;
1308 
1309   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1310     switch (Scope) {
1311     case SIAtomicScope::SYSTEM:
1312     case SIAtomicScope::AGENT:
1313       /// Do not set glc for RMW atomic operations as they implicitly bypass
1314       /// the L1 cache, and the glc bit is instead used to indicate if they are
1315       /// return or no-return.
1316       break;
1317     case SIAtomicScope::WORKGROUP:
1318     case SIAtomicScope::WAVEFRONT:
1319     case SIAtomicScope::SINGLETHREAD:
1320       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1321       break;
1322     default:
1323       llvm_unreachable("Unsupported synchronization scope");
1324     }
1325   }
1326 
1327   return Changed;
1328 }
1329 
1330 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1331     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1332     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1333   // Only handle load and store, not atomic read-modify-write instructions. The
1334   // latter use glc to indicate if the atomic returns a result and so must not
1335   // be used for cache control.
1336   assert(MI->mayLoad() ^ MI->mayStore());
1337 
1338   // Only update load and store, not LLVM IR atomic read-modify-write
1339   // instructions. The latter are always marked as volatile so cannot sensibly
1340   // handle it as do not want to pessimize all atomics. Also they do not support
1341   // the nontemporal attribute.
1342   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1343 
1344   bool Changed = false;
1345 
1346   if (IsVolatile) {
1347     // Set L1 cache policy to be MISS_EVICT for load instructions
1348     // and MISS_LRU for store instructions.
1349     // Note: there is no L2 cache bypass policy at the ISA level.
1350     if (Op == SIMemOp::LOAD)
1351       Changed |= enableGLCBit(MI);
1352 
1353     // Ensure operation has completed at system scope to cause all volatile
1354     // operations to be visible outside the program in a global order. Do not
1355     // request cross address space as only the global address space can be
1356     // observable outside the program, so no need to cause a waitcnt for LDS
1357     // address space operations.
1358     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1359                           Position::AFTER);
1360 
1361     return Changed;
1362   }
1363 
1364   if (IsNonTemporal) {
1365     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1366     // for both loads and stores, and the L2 cache policy to STREAM.
1367     Changed |= enableGLCBit(MI);
1368     Changed |= enableSLCBit(MI);
1369     return Changed;
1370   }
1371 
1372   return Changed;
1373 }
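
// Illustrative sketch (assumed mnemonics, not verified compiler output): for a
// volatile global load handled above, the glc bit is set and a system-scope
// wait follows, roughly:
//   global_load_dword v1, v[2:3], off glc
//   s_waitcnt vmcnt(0)
// A nontemporal access instead gets both glc and slc set and no extra wait.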
1374 
1375 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1376                                       SIAtomicScope Scope,
1377                                       SIAtomicAddrSpace AddrSpace,
1378                                       SIMemOp Op,
1379                                       bool IsCrossAddrSpaceOrdering,
1380                                       Position Pos) const {
1381   if (ST.isTgSplitEnabled()) {
1382     // In threadgroup split mode the waves of a work-group can be executing on
1383     // different CUs. Therefore need to wait for global or GDS memory operations
1384     // to complete to ensure they are visible to waves in the other CUs.
1385     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1386   // the same CU, so there is no need to wait for global memory as all waves in
1387   // the work-group access the same L1, nor to wait for GDS as accesses are
1388   // ordered on a CU.
1389     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1390                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1391         (Scope == SIAtomicScope::WORKGROUP)) {
1392       // Same as GFX7 using agent scope.
1393       Scope = SIAtomicScope::AGENT;
1394     }
1395     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1396     // LDS memory operations.
1397     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1398   }
1399   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1400                                         IsCrossAddrSpaceOrdering, Pos);
1401 }
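
// For example (a sketch of the intent only): with threadgroup split mode
// enabled, a workgroup-scope wait over the global address space is widened to
// agent scope by the code above, and any LDS component is dropped because LDS
// cannot be allocated in that mode.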
1402 
1403 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1404                                          SIAtomicScope Scope,
1405                                          SIAtomicAddrSpace AddrSpace,
1406                                          Position Pos) const {
1407   if (!InsertCacheInv)
1408     return false;
1409 
1410   bool Changed = false;
1411 
1412   MachineBasicBlock &MBB = *MI->getParent();
1413   DebugLoc DL = MI->getDebugLoc();
1414 
1415   if (Pos == Position::AFTER)
1416     ++MI;
1417 
1418   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1419     switch (Scope) {
1420     case SIAtomicScope::SYSTEM:
1421       // Ensures that following loads will not see stale remote VMEM data or
1422       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1423       // CC will never be stale due to the local memory probes.
1424       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1425       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1426       // hardware does not reorder memory operations by the same wave with
1427       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1428       // remove any cache lines of earlier writes by the same wave and ensures
1429       // later reads by the same wave will refetch the cache lines.
1430       Changed = true;
1431       break;
1432     case SIAtomicScope::AGENT:
1433       // Same as GFX7.
1434       break;
1435     case SIAtomicScope::WORKGROUP:
1436       // In threadgroup split mode the waves of a work-group can be executing on
1437       // different CUs. Therefore need to invalidate the L1 which is per CU.
1438       // Otherwise in non-threadgroup split mode all waves of a work-group are
1439       // on the same CU, and so the L1 does not need to be invalidated.
1440       if (ST.isTgSplitEnabled()) {
1441         // Same as GFX7 using agent scope.
1442         Scope = SIAtomicScope::AGENT;
1443       }
1444       break;
1445     case SIAtomicScope::WAVEFRONT:
1446     case SIAtomicScope::SINGLETHREAD:
1447       // Same as GFX7.
1448       break;
1449     default:
1450       llvm_unreachable("Unsupported synchronization scope");
1451     }
1452   }
1453 
1454   /// The scratch address space does not need the global memory cache
1455   /// to be flushed as all memory operations by the same thread are
1456   /// sequentially consistent, and no other thread can access scratch
1457   /// memory.
1458 
1459   /// Other address spaces do not have a cache.
1460 
1461   if (Pos == Position::AFTER)
1462     --MI;
1463 
1464   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1465 
1466   return Changed;
1467 }
1468 
1469 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1470                                          SIAtomicScope Scope,
1471                                          SIAtomicAddrSpace AddrSpace,
1472                                          bool IsCrossAddrSpaceOrdering,
1473                                          Position Pos) const {
1474   bool Changed = false;
1475 
1476   MachineBasicBlock &MBB = *MI->getParent();
1477   const DebugLoc &DL = MI->getDebugLoc();
1478 
1479   if (Pos == Position::AFTER)
1480     ++MI;
1481 
1482   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1483     switch (Scope) {
1484     case SIAtomicScope::SYSTEM:
1485       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1486       // hardware does not reorder memory operations by the same wave with
1487       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1488       // to initiate writeback of any dirty cache lines of earlier writes by the
1489       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1490       // writeback has completed.
1491       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1492         // Set SC bits to indicate system scope.
1493         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1494       // This is followed by the same handling as GFX7, which will insert the
1495       // "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1496       Changed = true;
1497       break;
1498     case SIAtomicScope::AGENT:
1499     case SIAtomicScope::WORKGROUP:
1500     case SIAtomicScope::WAVEFRONT:
1501     case SIAtomicScope::SINGLETHREAD:
1502       // Same as GFX7.
1503       break;
1504     default:
1505       llvm_unreachable("Unsupported synchronization scope");
1506     }
1507   }
1508 
1509   if (Pos == Position::AFTER)
1510     --MI;
1511 
1512   Changed |=
1513       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1514                                         IsCrossAddrSpaceOrdering, Pos);
1515 
1516   return Changed;
1517 }
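
// Illustrative sketch (not verified compiler output): a system-scope release
// handled above inserts a "BUFFER_WBL2" to initiate the L2 writeback, and the
// GFX7 path then provides the "S_WAITCNT vmcnt(0)" that waits for the
// writeback to complete before the releasing operation executes.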
1518 
1519 bool SIGfx940CacheControl::enableLoadCacheBypass(
1520     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1521     SIAtomicAddrSpace AddrSpace) const {
1522   assert(MI->mayLoad() && !MI->mayStore());
1523   bool Changed = false;
1524 
1525   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1526     switch (Scope) {
1527     case SIAtomicScope::SYSTEM:
1528       // Set SC bits to indicate system scope.
1529       Changed |= enableSC0Bit(MI);
1530       Changed |= enableSC1Bit(MI);
1531       break;
1532     case SIAtomicScope::AGENT:
1533       // Set SC bits to indicate agent scope.
1534       Changed |= enableSC1Bit(MI);
1535       break;
1536     case SIAtomicScope::WORKGROUP:
1537       // In threadgroup split mode the waves of a work-group can be executing on
1538       // different CUs. Therefore need to bypass the L1 which is per CU.
1539       // Otherwise in non-threadgroup split mode all waves of a work-group are
1540       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1541       // bits to indicate work-group scope will do this automatically.
1542       Changed |= enableSC0Bit(MI);
1543       break;
1544     case SIAtomicScope::WAVEFRONT:
1545     case SIAtomicScope::SINGLETHREAD:
1546       // Leave SC bits unset to indicate wavefront scope.
1547       break;
1548     default:
1549       llvm_unreachable("Unsupported synchronization scope");
1550     }
1551   }
1552 
1553   /// The scratch address space does not need the global memory caches
1554   /// to be bypassed as all memory operations by the same thread are
1555   /// sequentially consistent, and no other thread can access scratch
1556   /// memory.
1557 
1558   /// Other address spaces do not have a cache.
1559 
1560   return Changed;
1561 }
1562 
1563 bool SIGfx940CacheControl::enableStoreCacheBypass(
1564     const MachineBasicBlock::iterator &MI,
1565     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1566   assert(!MI->mayLoad() && MI->mayStore());
1567   bool Changed = false;
1568 
1569   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1570     switch (Scope) {
1571     case SIAtomicScope::SYSTEM:
1572       // Set SC bits to indicate system scope.
1573       Changed |= enableSC0Bit(MI);
1574       Changed |= enableSC1Bit(MI);
1575       break;
1576     case SIAtomicScope::AGENT:
1577       // Set SC bits to indicate agent scope.
1578       Changed |= enableSC1Bit(MI);
1579       break;
1580     case SIAtomicScope::WORKGROUP:
1581       // Set SC bits to indicate workgroup scope.
1582       Changed |= enableSC0Bit(MI);
1583       break;
1584     case SIAtomicScope::WAVEFRONT:
1585     case SIAtomicScope::SINGLETHREAD:
1586       // Leave SC bits unset to indicate wavefront scope.
1587       break;
1588     default:
1589       llvm_unreachable("Unsupported synchronization scope");
1590     }
1591   }
1592 
1593   /// The scratch address space does not need the global memory caches
1594   /// to be bypassed as all memory operations by the same thread are
1595   /// sequentially consistent, and no other thread can access scratch
1596   /// memory.
1597 
1598   /// Other address spaces do not have a cache.
1599 
1600   return Changed;
1601 }
1602 
1603 bool SIGfx940CacheControl::enableRMWCacheBypass(
1604     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1605     SIAtomicAddrSpace AddrSpace) const {
1606   assert(MI->mayLoad() && MI->mayStore());
1607   bool Changed = false;
1608 
1609   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1610     switch (Scope) {
1611     case SIAtomicScope::SYSTEM:
1612       // Set SC1 bit to indicate system scope.
1613       Changed |= enableSC1Bit(MI);
1614       break;
1615     case SIAtomicScope::AGENT:
1616     case SIAtomicScope::WORKGROUP:
1617     case SIAtomicScope::WAVEFRONT:
1618     case SIAtomicScope::SINGLETHREAD:
1619       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1620       // to indicate system or agent scope. The SC0 bit is used to indicate if
1621       // they are return or no-return. Leave SC1 bit unset to indicate agent
1622       // scope.
1623       break;
1624     default:
1625       llvm_unreachable("Unsupported synchronization scope");
1626     }
1627   }
1628 
1629   return Changed;
1630 }
1631 
1632 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1633     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1634     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1635   // Only handle load and store, not atomic read-modify-write instructions. The
1636   // latter use glc to indicate if the atomic returns a result, so glc must not
1637   // be used for cache control.
1638   assert(MI->mayLoad() ^ MI->mayStore());
1639 
1640   // Only update load and store, not LLVM IR atomic read-modify-write
1641   // instructions. The latter are always marked as volatile, so they cannot be
1642   // handled sensibly here without pessimizing all atomics. They also do not
1643   // support the nontemporal attribute.
1644   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1645 
1646   bool Changed = false;
1647 
1648   if (IsVolatile) {
1649     // Set SC bits to indicate system scope.
1650     Changed |= enableSC0Bit(MI);
1651     Changed |= enableSC1Bit(MI);
1652 
1653     // Ensure operation has completed at system scope to cause all volatile
1654     // operations to be visible outside the program in a global order. Do not
1655     // request cross address space as only the global address space can be
1656     // observable outside the program, so no need to cause a waitcnt for LDS
1657     // address space operations.
1658     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1659                           Position::AFTER);
1660 
1661     return Changed;
1662   }
1663 
1664   if (IsNonTemporal) {
1665     Changed |= enableNTBit(MI);
1666     return Changed;
1667   }
1668 
1669   return Changed;
1670 }
1671 
1672 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1673                                          SIAtomicScope Scope,
1674                                          SIAtomicAddrSpace AddrSpace,
1675                                          Position Pos) const {
1676   if (!InsertCacheInv)
1677     return false;
1678 
1679   bool Changed = false;
1680 
1681   MachineBasicBlock &MBB = *MI->getParent();
1682   DebugLoc DL = MI->getDebugLoc();
1683 
1684   if (Pos == Position::AFTER)
1685     ++MI;
1686 
1687   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1688     switch (Scope) {
1689     case SIAtomicScope::SYSTEM:
1690       // Ensures that following loads will not see stale remote VMEM data or
1691       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1692       // CC will never be stale due to the local memory probes.
1693       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1694           // Set SC bits to indicate system scope.
1695           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1696       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1697       // hardware does not reorder memory operations by the same wave with
1698       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1699       // remove any cache lines of earlier writes by the same wave and ensures
1700       // later reads by the same wave will refetch the cache lines.
1701       Changed = true;
1702       break;
1703     case SIAtomicScope::AGENT:
1704       // Ensures that following loads will not see stale remote data or local
1705       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1706       // due to the memory probes.
1707       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1708           // Set SC bits to indicate agent scope.
1709           .addImm(AMDGPU::CPol::SC1);
1710       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1711       // does not reorder memory operations with respect to a preceding buffer
1712       // invalidate. The invalidate is guaranteed to remove any cache lines of
1713       // earlier writes and ensures later reads will refetch the cache lines.
1714       Changed = true;
1715       break;
1716     case SIAtomicScope::WORKGROUP:
1717       // In threadgroup split mode the waves of a work-group can be executing on
1718       // different CUs. Therefore need to invalidate the L1 which is per CU.
1719       // Otherwise in non-threadgroup split mode all waves of a work-group are
1720       // on the same CU, and so the L1 does not need to be invalidated.
1721       if (ST.isTgSplitEnabled()) {
1722         // Ensures L1 is invalidated if in threadgroup split mode. In
1723         // non-threadgroup split mode it is a NOP, but there is no point
1724         // generating it if we know we are not in that mode.
1725         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1726             // Set SC bits to indicate work-group scope.
1727             .addImm(AMDGPU::CPol::SC0);
1728         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1729         // does not reorder memory operations with respect to a preceding buffer
1730         // invalidate. The invalidate is guaranteed to remove any cache lines of
1731         // earlier writes and ensures later reads will refetch the cache lines.
1732         Changed = true;
1733       }
1734       break;
1735     case SIAtomicScope::WAVEFRONT:
1736     case SIAtomicScope::SINGLETHREAD:
1737       // Could generate "BUFFER_INV" but it would do nothing as there are no
1738       // caches to invalidate.
1739       break;
1740     default:
1741       llvm_unreachable("Unsupported synchronization scope");
1742     }
1743   }
1744 
1745   /// The scratch address space does not need the global memory cache
1746   /// to be flushed as all memory operations by the same thread are
1747   /// sequentially consistent, and no other thread can access scratch
1748   /// memory.
1749 
1750   /// Other address spaces do not have a cache.
1751 
1752   if (Pos == Position::AFTER)
1753     --MI;
1754 
1755   return Changed;
1756 }
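
// Illustrative sketch (assumed mnemonics, not verified compiler output): on
// GFX940 an agent-scope acquire inserts roughly
//   buffer_inv sc1
// a system-scope acquire sets both sc0 and sc1, and a workgroup-scope acquire
// emits the invalidate only when threadgroup split mode is enabled.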
1757 
1758 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1759                                          SIAtomicScope Scope,
1760                                          SIAtomicAddrSpace AddrSpace,
1761                                          bool IsCrossAddrSpaceOrdering,
1762                                          Position Pos) const {
1763   bool Changed = false;
1764 
1765   MachineBasicBlock &MBB = *MI->getParent();
1766   DebugLoc DL = MI->getDebugLoc();
1767 
1768   if (Pos == Position::AFTER)
1769     ++MI;
1770 
1771   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1772     switch (Scope) {
1773     case SIAtomicScope::SYSTEM:
1774       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1775       // hardware does not reorder memory operations by the same wave with
1776       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1777       // to initiate writeback of any dirty cache lines of earlier writes by the
1778       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1779       // writeback has completed.
1780       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1781           // Set SC bits to indicate system scope.
1782           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1783       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1784       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1785       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1786       Changed = true;
1787       break;
1788     case SIAtomicScope::AGENT:
1789       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1790           // Set SC bits to indicate agent scope.
1791           .addImm(AMDGPU::CPol::SC1);
1792 
1793       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1794       // SIAtomicScope::AGENT, the following insertWait will generate the
1795       // required "S_WAITCNT vmcnt(0)".
1796       Changed = true;
1797       break;
1798     case SIAtomicScope::WORKGROUP:
1799     case SIAtomicScope::WAVEFRONT:
1800     case SIAtomicScope::SINGLETHREAD:
1801       // Do not generate "BUFFER_WBL2" as there are no caches it would
1802       // write back, and it would require an otherwise unnecessary
1803       // "S_WAITCNT vmcnt(0)".
1804       break;
1805     default:
1806       llvm_unreachable("Unsupported synchronization scope");
1807     }
1808   }
1809 
1810   if (Pos == Position::AFTER)
1811     --MI;
1812 
1813   // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" above, as
1814   // well as any other S_WAITCNT that is needed.
1815   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1816                         IsCrossAddrSpaceOrdering, Pos);
1817 
1818   return Changed;
1819 }
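
// Illustrative sketch (assumed mnemonics, not verified compiler output): an
// agent-scope release on GFX940 becomes roughly
//   buffer_wbl2 sc1
//   s_waitcnt vmcnt(0)    ; produced by the insertWait call above
// with both sc0 and sc1 set on the "BUFFER_WBL2" for system scope instead.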
1820 
1821 bool SIGfx10CacheControl::enableLoadCacheBypass(
1822     const MachineBasicBlock::iterator &MI,
1823     SIAtomicScope Scope,
1824     SIAtomicAddrSpace AddrSpace) const {
1825   assert(MI->mayLoad() && !MI->mayStore());
1826   bool Changed = false;
1827 
1828   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1829     switch (Scope) {
1830     case SIAtomicScope::SYSTEM:
1831     case SIAtomicScope::AGENT:
1832       // Set the L0 and L1 cache policies to MISS_EVICT.
1833       // Note: there is no L2 cache coherent bypass control at the ISA level.
1834       Changed |= enableGLCBit(MI);
1835       Changed |= enableDLCBit(MI);
1836       break;
1837     case SIAtomicScope::WORKGROUP:
1838       // In WGP mode the waves of a work-group can be executing on either CU of
1839       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1840       // CU mode all waves of a work-group are on the same CU, and so the L0
1841       // does not need to be bypassed.
1842       if (!ST.isCuModeEnabled())
1843         Changed |= enableGLCBit(MI);
1844       break;
1845     case SIAtomicScope::WAVEFRONT:
1846     case SIAtomicScope::SINGLETHREAD:
1847       // No cache to bypass.
1848       break;
1849     default:
1850       llvm_unreachable("Unsupported synchronization scope");
1851     }
1852   }
1853 
1854   /// The scratch address space does not need the global memory caches
1855   /// to be bypassed as all memory operations by the same thread are
1856   /// sequentially consistent, and no other thread can access scratch
1857   /// memory.
1858 
1859   /// Other address spaces do not have a cache.
1860 
1861   return Changed;
1862 }
1863 
1864 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1865     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1866     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1867 
1868   // Only handle load and store, not atomic read-modify-write instructions. The
1869   // latter use glc to indicate if the atomic returns a result, so glc must not
1870   // be used for cache control.
1871   assert(MI->mayLoad() ^ MI->mayStore());
1872 
1873   // Only update load and store, not LLVM IR atomic read-modify-write
1874   // instructions. The latter are always marked as volatile, so they cannot be
1875   // handled sensibly here without pessimizing all atomics. They also do not
1876   // support the nontemporal attribute.
1877   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1878 
1879   bool Changed = false;
1880 
1881   if (IsVolatile) {
1882     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1883     // and MISS_LRU for store instructions.
1884     // Note: there is no L2 cache coherent bypass control at the ISA level.
1885     if (Op == SIMemOp::LOAD) {
1886       Changed |= enableGLCBit(MI);
1887       Changed |= enableDLCBit(MI);
1888     }
1889 
1890     // Ensure operation has completed at system scope to cause all volatile
1891     // operations to be visible outside the program in a global order. Do not
1892     // request cross address space as only the global address space can be
1893     // observable outside the program, so no need to cause a waitcnt for LDS
1894     // address space operations.
1895     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1896                           Position::AFTER);
1897     return Changed;
1898   }
1899 
1900   if (IsNonTemporal) {
1901     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1902     // and L2 cache policy to STREAM.
1903     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1904     // to MISS_EVICT and the L2 cache policy to STREAM.
1905     if (Op == SIMemOp::STORE)
1906       Changed |= enableGLCBit(MI);
1907     Changed |= enableSLCBit(MI);
1908 
1909     return Changed;
1910   }
1911 
1912   return Changed;
1913 }
1914 
1915 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1916                                      SIAtomicScope Scope,
1917                                      SIAtomicAddrSpace AddrSpace,
1918                                      SIMemOp Op,
1919                                      bool IsCrossAddrSpaceOrdering,
1920                                      Position Pos) const {
1921   bool Changed = false;
1922 
1923   MachineBasicBlock &MBB = *MI->getParent();
1924   DebugLoc DL = MI->getDebugLoc();
1925 
1926   if (Pos == Position::AFTER)
1927     ++MI;
1928 
1929   bool VMCnt = false;
1930   bool VSCnt = false;
1931   bool LGKMCnt = false;
1932 
1933   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1934       SIAtomicAddrSpace::NONE) {
1935     switch (Scope) {
1936     case SIAtomicScope::SYSTEM:
1937     case SIAtomicScope::AGENT:
1938       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1939         VMCnt |= true;
1940       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1941         VSCnt |= true;
1942       break;
1943     case SIAtomicScope::WORKGROUP:
1944       // In WGP mode the waves of a work-group can be executing on either CU of
1945       // the WGP. Therefore need to wait for operations to complete to ensure
1946       // they are visible to waves in the other CU as the L0 is per CU.
1947       // Otherwise, in CU mode, all waves of a work-group are on the same CU
1948       // which shares the same L0.
1949       if (!ST.isCuModeEnabled()) {
1950         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1951           VMCnt |= true;
1952         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1953           VSCnt |= true;
1954       }
1955       break;
1956     case SIAtomicScope::WAVEFRONT:
1957     case SIAtomicScope::SINGLETHREAD:
1958       // The L0 cache keeps all memory operations in order for
1959       // work-items in the same wavefront.
1960       break;
1961     default:
1962       llvm_unreachable("Unsupported synchronization scope");
1963     }
1964   }
1965 
1966   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1967     switch (Scope) {
1968     case SIAtomicScope::SYSTEM:
1969     case SIAtomicScope::AGENT:
1970     case SIAtomicScope::WORKGROUP:
1971       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1972       // not needed as LDS operations for all waves are executed in a total
1973       // global ordering as observed by all waves. Required if also
1974       // synchronizing with global/GDS memory as LDS operations could be
1975       // reordered with respect to later global/GDS memory operations of the
1976       // same wave.
1977       LGKMCnt |= IsCrossAddrSpaceOrdering;
1978       break;
1979     case SIAtomicScope::WAVEFRONT:
1980     case SIAtomicScope::SINGLETHREAD:
1981       // The LDS keeps all memory operations in order for
1982       // the same wavefront.
1983       break;
1984     default:
1985       llvm_unreachable("Unsupported synchronization scope");
1986     }
1987   }
1988 
1989   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1990     switch (Scope) {
1991     case SIAtomicScope::SYSTEM:
1992     case SIAtomicScope::AGENT:
1993       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1994       // is not needed as GDS operations for all waves are executed in a total
1995       // global ordering as observed by all waves. Required if also
1996       // synchronizing with global/LDS memory as GDS operations could be
1997       // reordered with respect to later global/LDS memory operations of the
1998       // same wave.
1999       LGKMCnt |= IsCrossAddrSpaceOrdering;
2000       break;
2001     case SIAtomicScope::WORKGROUP:
2002     case SIAtomicScope::WAVEFRONT:
2003     case SIAtomicScope::SINGLETHREAD:
2004       // The GDS keeps all memory operations in order for
2005       // the same work-group.
2006       break;
2007     default:
2008       llvm_unreachable("Unsupported synchronization scope");
2009     }
2010   }
2011 
2012   if (VMCnt || LGKMCnt) {
2013     unsigned WaitCntImmediate =
2014       AMDGPU::encodeWaitcnt(IV,
2015                             VMCnt ? 0 : getVmcntBitMask(IV),
2016                             getExpcntBitMask(IV),
2017                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2018     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2019         .addImm(WaitCntImmediate);
2020     Changed = true;
2021   }
2022 
2023   if (VSCnt) {
2024     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2025         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2026         .addImm(0);
2027     Changed = true;
2028   }
2029 
2030   if (Pos == Position::AFTER)
2031     --MI;
2032 
2033   return Changed;
2034 }
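
// Illustrative sketch (assumed textual form, not verified compiler output): an
// agent-scope wait covering global loads and stores plus cross-address-space
// LDS ordering would typically be emitted as
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0
// while a wavefront-scope request produces no wait instructions at all.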
2035 
2036 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2037                                         SIAtomicScope Scope,
2038                                         SIAtomicAddrSpace AddrSpace,
2039                                         Position Pos) const {
2040   if (!InsertCacheInv)
2041     return false;
2042 
2043   bool Changed = false;
2044 
2045   MachineBasicBlock &MBB = *MI->getParent();
2046   DebugLoc DL = MI->getDebugLoc();
2047 
2048   if (Pos == Position::AFTER)
2049     ++MI;
2050 
2051   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2052     switch (Scope) {
2053     case SIAtomicScope::SYSTEM:
2054     case SIAtomicScope::AGENT:
2055       // The order of invalidates matters here. We must invalidate "outer in"
2056       // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2057       // invalidated.
2058       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2059       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2060       Changed = true;
2061       break;
2062     case SIAtomicScope::WORKGROUP:
2063       // In WGP mode the waves of a work-group can be executing on either CU of
2064       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2065       // in CU mode, all waves of a work-group are on the same CU, and so the
2066       // L0 does not need to be invalidated.
2067       if (!ST.isCuModeEnabled()) {
2068         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2069         Changed = true;
2070       }
2071       break;
2072     case SIAtomicScope::WAVEFRONT:
2073     case SIAtomicScope::SINGLETHREAD:
2074       // No cache to invalidate.
2075       break;
2076     default:
2077       llvm_unreachable("Unsupported synchronization scope");
2078     }
2079   }
2080 
2081   /// The scratch address space does not need the global memory cache
2082   /// to be flushed as all memory operations by the same thread are
2083   /// sequentially consistent, and no other thread can access scratch
2084   /// memory.
2085 
2086   /// Other address spaces do not have a cache.
2087 
2088   if (Pos == Position::AFTER)
2089     --MI;
2090 
2091   return Changed;
2092 }
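
// Illustrative sketch (not verified compiler output): an agent-scope acquire
// on GFX10 expands to
//   buffer_gl1_inv
//   buffer_gl0_inv
// and a workgroup-scope acquire emits only "BUFFER_GL0_INV", and only when the
// work-group may span both CUs of a WGP.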
2093 
2094 bool SIGfx11CacheControl::enableLoadCacheBypass(
2095     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2096     SIAtomicAddrSpace AddrSpace) const {
2097   assert(MI->mayLoad() && !MI->mayStore());
2098   bool Changed = false;
2099 
2100   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2101     switch (Scope) {
2102     case SIAtomicScope::SYSTEM:
2103     case SIAtomicScope::AGENT:
2104       // Set the L0 and L1 cache policies to MISS_EVICT.
2105       // Note: there is no L2 cache coherent bypass control at the ISA level.
2106       Changed |= enableGLCBit(MI);
2107       break;
2108     case SIAtomicScope::WORKGROUP:
2109       // In WGP mode the waves of a work-group can be executing on either CU of
2110       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2111       // CU mode all waves of a work-group are on the same CU, and so the L0
2112       // does not need to be bypassed.
2113       if (!ST.isCuModeEnabled())
2114         Changed |= enableGLCBit(MI);
2115       break;
2116     case SIAtomicScope::WAVEFRONT:
2117     case SIAtomicScope::SINGLETHREAD:
2118       // No cache to bypass.
2119       break;
2120     default:
2121       llvm_unreachable("Unsupported synchronization scope");
2122     }
2123   }
2124 
2125   /// The scratch address space does not need the global memory caches
2126   /// to be bypassed as all memory operations by the same thread are
2127   /// sequentially consistent, and no other thread can access scratch
2128   /// memory.
2129 
2130   /// Other address spaces do not have a cache.
2131 
2132   return Changed;
2133 }
2134 
2135 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2136     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2137     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2138 
2139   // Only handle load and store, not atomic read-modify-write instructions. The
2140   // latter use glc to indicate if the atomic returns a result, so glc must not
2141   // be used for cache control.
2142   assert(MI->mayLoad() ^ MI->mayStore());
2143 
2144   // Only update load and store, not LLVM IR atomic read-modify-write
2145   // instructions. The latter are always marked as volatile, so they cannot be
2146   // handled sensibly here without pessimizing all atomics. They also do not
2147   // support the nontemporal attribute.
2148   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2149 
2150   bool Changed = false;
2151 
2152   if (IsVolatile) {
2153     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2154     // and MISS_LRU for store instructions.
2155     // Note: there is no L2 cache coherent bypass control at the ISA level.
2156     if (Op == SIMemOp::LOAD)
2157       Changed |= enableGLCBit(MI);
2158 
2159     // Set MALL NOALLOC for load and store instructions.
2160     Changed |= enableDLCBit(MI);
2161 
2162     // Ensure operation has completed at system scope to cause all volatile
2163     // operations to be visible outside the program in a global order. Do not
2164     // request cross address space as only the global address space can be
2165     // observable outside the program, so no need to cause a waitcnt for LDS
2166     // address space operations.
2167     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2168                           Position::AFTER);
2169     return Changed;
2170   }
2171 
2172   if (IsNonTemporal) {
2173     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2174     // and L2 cache policy to STREAM.
2175     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2176     // to MISS_EVICT and the L2 cache policy to STREAM.
2177     if (Op == SIMemOp::STORE)
2178       Changed |= enableGLCBit(MI);
2179     Changed |= enableSLCBit(MI);
2180 
2181     // Set MALL NOALLOC for load and store instructions.
2182     Changed |= enableDLCBit(MI);
2183     return Changed;
2184   }
2185 
2186   return Changed;
2187 }
2188 
2189 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2190                                 AMDGPU::CPol::CPol Value) const {
2191   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2192   if (!CPol)
2193     return false;
2194 
2195   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2196   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2197     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2198     return true;
2199   }
2200 
2201   return false;
2202 }
2203 
2204 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2205                                    AMDGPU::CPol::CPol Value) const {
2206   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2207   if (!CPol)
2208     return false;
2209 
2210   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2211   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2212     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2213     return true;
2214   }
2215 
2216   return false;
2217 }
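
// Usage sketch (illustrative only): callers rewrite one field of the cpol
// operand while leaving the other fields intact, for example
//   setTH(MI, AMDGPU::CPol::TH_NT);        // request the non-temporal hint
//   setScope(MI, AMDGPU::CPol::SCOPE_SYS); // widen the coherence scope
// Both helpers return false if the instruction has no cpol operand or the
// field already holds the requested value.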
2218 
2219 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2220     const MachineBasicBlock::iterator MI) const {
2221   // TODO: implement flag for frontend to give us a hint not to insert waits.
2222 
2223   MachineBasicBlock &MBB = *MI->getParent();
2224   const DebugLoc &DL = MI->getDebugLoc();
2225 
2226   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2227   BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2228   BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2229   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2230   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2231 
2232   return true;
2233 }
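
// Illustrative sketch (assumed textual form, not verified compiler output):
// before a system-scope store this inserts, in order, soft waits that drain
// loads, samples, BVH operations, SMEM, and prior stores, e.g. roughly
//   s_wait_loadcnt 0x0
//   s_wait_samplecnt 0x0
//   s_wait_bvhcnt 0x0
//   s_wait_kmcnt 0x0
//   s_wait_storecnt 0x0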
2234 
2235 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2236                                      SIAtomicScope Scope,
2237                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2238                                      bool IsCrossAddrSpaceOrdering,
2239                                      Position Pos) const {
2240   bool Changed = false;
2241 
2242   MachineBasicBlock &MBB = *MI->getParent();
2243   DebugLoc DL = MI->getDebugLoc();
2244 
2245   bool LOADCnt = false;
2246   bool DSCnt = false;
2247   bool STORECnt = false;
2248 
2249   if (Pos == Position::AFTER)
2250     ++MI;
2251 
2252   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2253       SIAtomicAddrSpace::NONE) {
2254     switch (Scope) {
2255     case SIAtomicScope::SYSTEM:
2256     case SIAtomicScope::AGENT:
2257       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2258         LOADCnt |= true;
2259       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2260         STORECnt |= true;
2261       break;
2262     case SIAtomicScope::WORKGROUP:
2263       // In WGP mode the waves of a work-group can be executing on either CU of
2264       // the WGP. Therefore need to wait for operations to complete to ensure
2265       // they are visible to waves in the other CU as the L0 is per CU.
2266       // Otherwise, in CU mode, all waves of a work-group are on the same CU
2267       // which shares the same L0.
2268       if (!ST.isCuModeEnabled()) {
2269         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2270           LOADCnt |= true;
2271         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2272           STORECnt |= true;
2273       }
2274       break;
2275     case SIAtomicScope::WAVEFRONT:
2276     case SIAtomicScope::SINGLETHREAD:
2277       // The L0 cache keeps all memory operations in order for
2278       // work-items in the same wavefront.
2279       break;
2280     default:
2281       llvm_unreachable("Unsupported synchronization scope");
2282     }
2283   }
2284 
2285   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2286     switch (Scope) {
2287     case SIAtomicScope::SYSTEM:
2288     case SIAtomicScope::AGENT:
2289     case SIAtomicScope::WORKGROUP:
2290       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2291       // not needed as LDS operations for all waves are executed in a total
2292       // global ordering as observed by all waves. Required if also
2293       // synchronizing with global/GDS memory as LDS operations could be
2294       // reordered with respect to later global/GDS memory operations of the
2295       // same wave.
2296       DSCnt |= IsCrossAddrSpaceOrdering;
2297       break;
2298     case SIAtomicScope::WAVEFRONT:
2299     case SIAtomicScope::SINGLETHREAD:
2300       // The LDS keeps all memory operations in order for
2301       // the same wavefront.
2302       break;
2303     default:
2304       llvm_unreachable("Unsupported synchronization scope");
2305     }
2306   }
2307 
2308   if (LOADCnt) {
2309     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2310     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2311     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2312     Changed = true;
2313   }
2314 
2315   if (STORECnt) {
2316     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2317     Changed = true;
2318   }
2319 
2320   if (DSCnt) {
2321     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2322     Changed = true;
2323   }
2324 
2325   if (Pos == Position::AFTER)
2326     --MI;
2327 
2328   return Changed;
2329 }
2330 
2331 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2332                                         SIAtomicScope Scope,
2333                                         SIAtomicAddrSpace AddrSpace,
2334                                         Position Pos) const {
2335   if (!InsertCacheInv)
2336     return false;
2337 
2338   MachineBasicBlock &MBB = *MI->getParent();
2339   DebugLoc DL = MI->getDebugLoc();
2340 
2341   /// The scratch address space does not need the global memory cache
2342   /// to be flushed as all memory operations by the same thread are
2343   /// sequentially consistent, and no other thread can access scratch
2344   /// memory.
2345 
2346   /// Other address spaces do not have a cache.
2347   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2348     return false;
2349 
2350   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2351   switch (Scope) {
2352   case SIAtomicScope::SYSTEM:
2353     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2354     break;
2355   case SIAtomicScope::AGENT:
2356     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2357     break;
2358   case SIAtomicScope::WORKGROUP:
2359     // In WGP mode the waves of a work-group can be executing on either CU of
2360     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2361     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2362     // the L0 does not need to be invalidated.
2363     if (ST.isCuModeEnabled())
2364       return false;
2365 
2366     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2367     break;
2368   case SIAtomicScope::WAVEFRONT:
2369   case SIAtomicScope::SINGLETHREAD:
2370     // No cache to invalidate.
2371     return false;
2372   default:
2373     llvm_unreachable("Unsupported synchronization scope");
2374   }
2375 
2376   if (Pos == Position::AFTER)
2377     ++MI;
2378 
2379   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2380 
2381   if (Pos == Position::AFTER)
2382     --MI;
2383 
2384   return true;
2385 }
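
// Illustrative sketch (assumed textual form, not verified compiler output): a
// system-scope acquire on GFX12 expands to a single
//   global_inv scope:SCOPE_SYS
// with SCOPE_DEV used for agent scope and SCOPE_SE for workgroup scope when
// the work-group may span both CUs of a WGP.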
2386 
2387 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2388     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2389     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2390 
2391   // Only handle load and store, not atomic read-modify-write instructions.
2392   assert(MI->mayLoad() ^ MI->mayStore());
2393 
2394   // Only update load and store, not LLVM IR atomic read-modify-write
2395   // instructions. The latter are always marked as volatile, so they cannot be
2396   // handled sensibly here without pessimizing all atomics. They also do not
2397   // support the nontemporal attribute.
2398   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2399 
2400   bool Changed = false;
2401 
2402   if (IsLastUse) {
2403     // Set last-use hint.
2404     Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2405   } else if (IsNonTemporal) {
2406     // Set non-temporal hint for all cache levels.
2407     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2408   }
2409 
2410   if (IsVolatile) {
2411     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2412 
2413     if (Op == SIMemOp::STORE)
2414       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2415 
2416     // Ensure operation has completed at system scope to cause all volatile
2417     // operations to be visible outside the program in a global order. Do not
2418     // request cross address space as only the global address space can be
2419     // observable outside the program, so no need to cause a waitcnt for LDS
2420     // address space operations.
2421     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2422                           Position::AFTER);
2423   }
2424 
2425   return Changed;
2426 }
2427 
2428 bool SIGfx12CacheControl::expandSystemScopeStore(
2429     MachineBasicBlock::iterator &MI) const {
2430   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2431   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2432     return insertWaitsBeforeSystemScopeStore(MI);
2433 
2434   return false;
2435 }
2436 
2437 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2438   if (AtomicPseudoMIs.empty())
2439     return false;
2440 
2441   for (auto &MI : AtomicPseudoMIs)
2442     MI->eraseFromParent();
2443 
2444   AtomicPseudoMIs.clear();
2445   return true;
2446 }
2447 
2448 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2449                                    MachineBasicBlock::iterator &MI) {
2450   assert(MI->mayLoad() && !MI->mayStore());
2451 
2452   bool Changed = false;
2453 
2454   if (MOI.isAtomic()) {
2455     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2456         MOI.getOrdering() == AtomicOrdering::Acquire ||
2457         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2458       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2459                                            MOI.getOrderingAddrSpace());
2460     }
2461 
2462     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2463       Changed |= CC->insertWait(MI, MOI.getScope(),
2464                                 MOI.getOrderingAddrSpace(),
2465                                 SIMemOp::LOAD | SIMemOp::STORE,
2466                                 MOI.getIsCrossAddressSpaceOrdering(),
2467                                 Position::BEFORE);
2468 
2469     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2470         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2471       Changed |= CC->insertWait(MI, MOI.getScope(),
2472                                 MOI.getInstrAddrSpace(),
2473                                 SIMemOp::LOAD,
2474                                 MOI.getIsCrossAddressSpaceOrdering(),
2475                                 Position::AFTER);
2476       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2477                                    MOI.getOrderingAddrSpace(),
2478                                    Position::AFTER);
2479     }
2480 
2481     return Changed;
2482   }
2483 
2484   // Atomic instructions already bypass caches to the scope specified by the
2485   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2486   // instructions need additional treatment.
2487   Changed |= CC->enableVolatileAndOrNonTemporal(
2488       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2489       MOI.isNonTemporal(), MOI.isLastUse());
2490 
2491   return Changed;
2492 }
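
// Illustrative sketch (not verified output): an IR "load atomic ... acquire"
// at agent scope is expanded by the code above into cache-bypass bits on the
// load itself, a wait after the load, and an acquire (cache invalidate) after
// that wait, so later accesses cannot observe stale data.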
2493 
2494 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2495                                     MachineBasicBlock::iterator &MI) {
2496   assert(!MI->mayLoad() && MI->mayStore());
2497 
2498   bool Changed = false;
2499 
2500   if (MOI.isAtomic()) {
2501     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2502         MOI.getOrdering() == AtomicOrdering::Release ||
2503         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2504       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2505                                             MOI.getOrderingAddrSpace());
2506     }
2507 
2508     if (MOI.getOrdering() == AtomicOrdering::Release ||
2509         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2510       Changed |= CC->insertRelease(MI, MOI.getScope(),
2511                                    MOI.getOrderingAddrSpace(),
2512                                    MOI.getIsCrossAddressSpaceOrdering(),
2513                                    Position::BEFORE);
2514 
2515     return Changed;
2516   }
2517 
2518   // Atomic instructions already bypass caches to the scope specified by the
2519   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2520   // need additional treatment.
2521   Changed |= CC->enableVolatileAndOrNonTemporal(
2522       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2523       MOI.isNonTemporal());
2524 
2525   // GFX12 specific: scope (the desired coherence domain in the cache
2526   // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2527   Changed |= CC->expandSystemScopeStore(MI);
2528   return Changed;
2529 }
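
// Illustrative sketch (not verified output): an IR "store atomic ... release"
// at agent scope gets its cache-bypass bits set and is preceded by a release
// (a wait and, on some targets, a cache writeback) so that earlier accesses
// are visible before the store itself is performed.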
2530 
2531 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2532                                           MachineBasicBlock::iterator &MI) {
2533   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2534 
2535   AtomicPseudoMIs.push_back(MI);
2536   bool Changed = false;
2537 
2538   if (MOI.isAtomic()) {
2539     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2540       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2541                                 SIMemOp::LOAD | SIMemOp::STORE,
2542                                 MOI.getIsCrossAddressSpaceOrdering(),
2543                                 Position::BEFORE);
2544 
2545     if (MOI.getOrdering() == AtomicOrdering::Release ||
2546         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2547         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2548       /// TODO: This relies on a barrier always generating a waitcnt
2549       /// for LDS to ensure it is not reordered with the completion of
2550       /// the preceding LDS operations. If the barrier had a memory
2551       /// ordering and memory scope, then the library would not need to
2552       /// generate a fence. Support for barriers could be added in this
2553       /// file. SIInsertWaitcnt.cpp could then stop unconditionally
2554       /// adding an S_WAITCNT before an S_BARRIER.
2555       Changed |= CC->insertRelease(MI, MOI.getScope(),
2556                                    MOI.getOrderingAddrSpace(),
2557                                    MOI.getIsCrossAddressSpaceOrdering(),
2558                                    Position::BEFORE);
2559 
2560     // TODO: If both release and invalidate are happening they could be combined
2561     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2562     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2563     // track cache invalidate and write back instructions.
2564 
2565     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2566         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2567         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2568       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2569                                    MOI.getOrderingAddrSpace(),
2570                                    Position::BEFORE);
2571 
2572     return Changed;
2573   }
2574 
2575   return Changed;
2576 }
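
// Illustrative sketch (not verified output): a "fence acq_rel" at agent scope
// is expanded at the position of the ATOMIC_FENCE pseudo into a release (wait
// and, where needed, cache writeback) followed by an acquire (cache
// invalidate); the pseudo itself is erased by removeAtomicPseudoMIs().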
2577 
2578 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2579   MachineBasicBlock::iterator &MI) {
2580   assert(MI->mayLoad() && MI->mayStore());
2581 
2582   bool Changed = false;
2583 
2584   if (MOI.isAtomic()) {
2585     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2586         MOI.getOrdering() == AtomicOrdering::Acquire ||
2587         MOI.getOrdering() == AtomicOrdering::Release ||
2588         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2589         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2590       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2591                                           MOI.getInstrAddrSpace());
2592     }
2593 
2594     if (MOI.getOrdering() == AtomicOrdering::Release ||
2595         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2596         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2597         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2598       Changed |= CC->insertRelease(MI, MOI.getScope(),
2599                                    MOI.getOrderingAddrSpace(),
2600                                    MOI.getIsCrossAddressSpaceOrdering(),
2601                                    Position::BEFORE);
2602 
2603     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2604         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2605         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2606         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2607         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2608       Changed |= CC->insertWait(MI, MOI.getScope(),
2609                                 MOI.getInstrAddrSpace(),
2610                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2611                                                    SIMemOp::STORE,
2612                                 MOI.getIsCrossAddressSpaceOrdering(),
2613                                 Position::AFTER);
2614       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2615                                    MOI.getOrderingAddrSpace(),
2616                                    Position::AFTER);
2617     }
2618 
2619     return Changed;
2620   }
2621 
2622   return Changed;
2623 }
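
// Illustrative sketch (not verified output): an IR "atomicrmw ... acq_rel" at
// agent scope is expanded by the code above into RMW cache-bypass bits, a
// release inserted before the operation, a wait after it (treated as a load
// for returning atomics and as a store otherwise), and an acquire after that.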
2624 
2625 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2626   bool Changed = false;
2627 
2628   SIMemOpAccess MOA(MF);
2629   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2630 
2631   for (auto &MBB : MF) {
2632     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2633 
2634       // Unbundle instructions after the post-RA scheduler.
2635       if (MI->isBundle() && MI->mayLoadOrStore()) {
2636         MachineBasicBlock::instr_iterator II(MI->getIterator());
2637         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2638              I != E && I->isBundledWithPred(); ++I) {
2639           I->unbundleFromPred();
2640           for (MachineOperand &MO : I->operands())
2641             if (MO.isReg())
2642               MO.setIsInternalRead(false);
2643         }
2644 
2645         MI->eraseFromParent();
2646         MI = II->getIterator();
2647       }
2648 
2649       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2650         continue;
2651 
2652       if (const auto &MOI = MOA.getLoadInfo(MI))
2653         Changed |= expandLoad(*MOI, MI);
2654       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2655         Changed |= expandStore(*MOI, MI);
2656         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2657       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2658         Changed |= expandAtomicFence(*MOI, MI);
2659       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2660         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2661     }
2662   }
2663 
2664   Changed |= removeAtomicPseudoMIs();
2665   return Changed;
2666 }
2667 
2668 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2669 
2670 char SIMemoryLegalizer::ID = 0;
2671 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2672 
2673 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2674   return new SIMemoryLegalizer();
2675 }
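
// Usage sketch (assumed pipeline hook, for illustration only): the AMDGPU pass
// configuration is expected to run this pass late, after the post-RA
// scheduler, e.g. via something like addPass(createSIMemoryLegalizerPass()).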
2676