1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/CodeGen/MachineBasicBlock.h"
23 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/IR/DiagnosticInfo.h"
25 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
26 #include "llvm/Support/AtomicOrdering.h"
27 #include "llvm/TargetParser/TargetParser.h"
28 
29 using namespace llvm;
30 using namespace llvm::AMDGPU;
31 
32 #define DEBUG_TYPE "si-memory-legalizer"
33 #define PASS_NAME "SI Memory Legalizer"
34 
35 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
36     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
37     cl::desc("Use this to skip inserting cache invalidating instructions."));
38 
39 namespace {
40 
41 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
42 
43 /// Memory operation flags. Can be ORed together.
44 enum class SIMemOp {
45   NONE = 0u,
46   LOAD = 1u << 0,
47   STORE = 1u << 1,
48   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
49 };
50 
51 /// Position to insert a new instruction relative to an existing
52 /// instruction.
53 enum class Position {
54   BEFORE,
55   AFTER
56 };
57 
58 /// The atomic synchronization scopes supported by the AMDGPU target.
59 enum class SIAtomicScope {
60   NONE,
61   SINGLETHREAD,
62   WAVEFRONT,
63   WORKGROUP,
64   AGENT,
65   SYSTEM
66 };
67 
68 /// The distinct address spaces supported by the AMDGPU target for
69 /// atomic memory operations. Can be ORed together.
70 enum class SIAtomicAddrSpace {
71   NONE = 0u,
72   GLOBAL = 1u << 0,
73   LDS = 1u << 1,
74   SCRATCH = 1u << 2,
75   GDS = 1u << 3,
76   OTHER = 1u << 4,
77 
78   /// The address spaces that can be accessed by a FLAT instruction.
79   FLAT = GLOBAL | LDS | SCRATCH,
80 
81   /// The address spaces that support atomic instructions.
82   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
83 
84   /// All address spaces.
85   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
86 
87   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
88 };
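// These bitmask enums support the usual bitwise operators; membership is
// tested by comparing against NONE, e.g.
//   (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE
// as is done throughout the cache-control implementations below.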
89 
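/// Summarizes the memory-model-relevant properties of a machine instruction:
/// its atomic ordering and synchronization scope, the address spaces it
/// accesses and orders, and whether it is volatile, nontemporal, or a last
/// use. Constructed by SIMemOpAccess from the instruction's memory operands.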
90 class SIMemOpInfo final {
91 private:
92 
93   friend class SIMemOpAccess;
94 
95   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
96   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
97   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
98   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
99   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
100   bool IsCrossAddressSpaceOrdering = false;
101   bool IsVolatile = false;
102   bool IsNonTemporal = false;
103   bool IsLastUse = false;
104 
105   SIMemOpInfo(
106       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
107       SIAtomicScope Scope = SIAtomicScope::SYSTEM,
108       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
109       SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
110       bool IsCrossAddressSpaceOrdering = true,
111       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
112       bool IsVolatile = false, bool IsNonTemporal = false,
113       bool IsLastUse = false)
114       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
115         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
116         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
117         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
118         IsLastUse(IsLastUse) {
119 
120     if (Ordering == AtomicOrdering::NotAtomic) {
121       assert(Scope == SIAtomicScope::NONE &&
122              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
123              !IsCrossAddressSpaceOrdering &&
124              FailureOrdering == AtomicOrdering::NotAtomic);
125       return;
126     }
127 
128     assert(Scope != SIAtomicScope::NONE &&
129            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE &&
131            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
132                SIAtomicAddrSpace::NONE);
133 
134     // There is also no cross address space ordering if the ordering
135     // address space is the same as the instruction address space and
136     // only contains a single address space.
137     if ((OrderingAddrSpace == InstrAddrSpace) &&
138         isPowerOf2_32(uint32_t(InstrAddrSpace)))
139       this->IsCrossAddressSpaceOrdering = false;
140 
141     // Limit the scope to the maximum supported by the instruction's address
142     // spaces.
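    // For example, an instruction that only accesses scratch can be at most
    // single-thread scope, and one that only accesses scratch and LDS can be
    // at most workgroup scope.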
143     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
144         SIAtomicAddrSpace::NONE) {
145       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
146     } else if ((InstrAddrSpace &
147                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
148                SIAtomicAddrSpace::NONE) {
149       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
150     } else if ((InstrAddrSpace &
151                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
152                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
153       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
154     }
155   }
156 
157 public:
158   /// \returns Atomic synchronization scope of the machine instruction used to
159   /// create this SIMemOpInfo.
160   SIAtomicScope getScope() const {
161     return Scope;
162   }
163 
164   /// \returns Ordering constraint of the machine instruction used to
165   /// create this SIMemOpInfo.
166   AtomicOrdering getOrdering() const {
167     return Ordering;
168   }
169 
170   /// \returns Failure ordering constraint of the machine instruction used to
171   /// create this SIMemOpInfo.
172   AtomicOrdering getFailureOrdering() const {
173     return FailureOrdering;
174   }
175 
176   /// \returns The address spaces accessed by the machine
177   /// instruction used to create this SIMemOpInfo.
178   SIAtomicAddrSpace getInstrAddrSpace() const {
179     return InstrAddrSpace;
180   }
181 
182   /// \returns The address spaces that must be ordered by the machine
183   /// instruction used to create this SIMemOpInfo.
184   SIAtomicAddrSpace getOrderingAddrSpace() const {
185     return OrderingAddrSpace;
186   }
187 
188   /// \returns True iff memory ordering of operations on
189   /// different address spaces is required.
190   bool getIsCrossAddressSpaceOrdering() const {
191     return IsCrossAddressSpaceOrdering;
192   }
193 
194   /// \returns True if memory access of the machine instruction used to
195   /// create this SIMemOpInfo is volatile, false otherwise.
196   bool isVolatile() const {
197     return IsVolatile;
198   }
199 
200   /// \returns True if memory access of the machine instruction used to
201   /// create this SIMemOpInfo is nontemporal, false otherwise.
202   bool isNonTemporal() const {
203     return IsNonTemporal;
204   }
205 
206   /// \returns True if memory access of the machine instruction used to
207   /// create this SIMemOpInfo is last use, false otherwise.
208   bool isLastUse() const { return IsLastUse; }
209 
210   /// \returns True if ordering constraint of the machine instruction used to
211   /// create this SIMemOpInfo is unordered or higher, false otherwise.
212   bool isAtomic() const {
213     return Ordering != AtomicOrdering::NotAtomic;
214   }
215 
216 };
217 
218 class SIMemOpAccess final {
219 private:
220   AMDGPUMachineModuleInfo *MMI = nullptr;
221 
222   /// Reports unsupported message \p Msg for \p MI to LLVM context.
223   void reportUnsupported(const MachineBasicBlock::iterator &MI,
224                          const char *Msg) const;
225 
226   /// Inspects the target synchronization scope \p SSID and determines
227   /// the SI atomic scope it corresponds to, the address spaces it
228   /// covers, and whether the memory ordering applies between address
229   /// spaces.
230   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
231   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
232 
233   /// \return Return a bit set of the address spaces accessed by \p AS.
234   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
235 
236   /// \returns Info constructed from \p MI, which has at least one machine
237   /// memory operand.
238   std::optional<SIMemOpInfo>
239   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
240 
241 public:
242   /// Construct class to support accessing the machine memory operands
243   /// of instructions in the machine function \p MF.
244   SIMemOpAccess(MachineFunction &MF);
245 
246   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
247   std::optional<SIMemOpInfo>
248   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
249 
250   /// \returns Store info if \p MI is a store operation, "std::nullopt"
251   /// otherwise.
252   std::optional<SIMemOpInfo>
253   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
254 
255   /// \returns Atomic fence info if \p MI is an atomic fence operation,
256   /// "std::nullopt" otherwise.
257   std::optional<SIMemOpInfo>
258   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
259 
260   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
261   /// rmw operation, "std::nullopt" otherwise.
262   std::optional<SIMemOpInfo>
263   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
264 };
265 
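/// Interface for emitting the cache-policy bits, cache invalidations, and wait
/// instructions a subtarget needs to implement the memory model. A concrete
/// subclass is selected per subtarget by SICacheControl::create().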
266 class SICacheControl {
267 protected:
268 
269   /// AMDGPU subtarget info.
270   const GCNSubtarget &ST;
271 
272   /// Instruction info.
273   const SIInstrInfo *TII = nullptr;
274 
275   IsaVersion IV;
276 
277   /// Whether to insert cache invalidating instructions.
278   bool InsertCacheInv;
279 
280   SICacheControl(const GCNSubtarget &ST);
281 
282   /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
283   /// \returns True if \p MI is modified, false otherwise.
284   bool enableNamedBit(const MachineBasicBlock::iterator MI,
285                       AMDGPU::CPol::CPol Bit) const;
286 
287 public:
288 
289   /// Create a cache control for the subtarget \p ST.
290   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
291 
292   /// Update \p MI memory load instruction to bypass any caches up to
293   /// the \p Scope memory scope for address spaces \p
294   /// AddrSpace. Return true iff the instruction was modified.
295   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
296                                      SIAtomicScope Scope,
297                                      SIAtomicAddrSpace AddrSpace) const = 0;
298 
299   /// Update \p MI memory store instruction to bypass any caches up to
300   /// the \p Scope memory scope for address spaces \p
301   /// AddrSpace. Return true iff the instruction was modified.
302   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303                                       SIAtomicScope Scope,
304                                       SIAtomicAddrSpace AddrSpace) const = 0;
305 
306   /// Update \p MI memory read-modify-write instruction to bypass any caches up
307   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308   /// iff the instruction was modified.
309   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310                                     SIAtomicScope Scope,
311                                     SIAtomicAddrSpace AddrSpace) const = 0;
312 
313   /// Update \p MI memory instruction of kind \p Op associated with address
314   /// spaces \p AddrSpace to indicate it is volatile and/or
315   /// nontemporal/last-use. Return true iff the instruction was modified.
316   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
317                                               SIAtomicAddrSpace AddrSpace,
318                                               SIMemOp Op, bool IsVolatile,
319                                               bool IsNonTemporal,
320                                               bool IsLastUse = false) const = 0;
321 
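  /// Hook for subtargets that must expand stores performed at system scope
  /// (overridden for GFX12). The default implementation does nothing and
  /// returns false; returns true iff \p MI is modified.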
322   virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323     return false;
324   }
325 
326   /// Inserts any necessary instructions at position \p Pos relative
327   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328   /// \p Op associated with address spaces \p AddrSpace have completed. Used
329   /// between memory instructions to enforce the order they become visible as
330   /// observed by other memory instructions executing in memory scope \p Scope.
331   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332   /// address spaces. Returns true iff any instructions are inserted.
333   virtual bool insertWait(MachineBasicBlock::iterator &MI,
334                           SIAtomicScope Scope,
335                           SIAtomicAddrSpace AddrSpace,
336                           SIMemOp Op,
337                           bool IsCrossAddrSpaceOrdering,
338                           Position Pos) const = 0;
339 
340   /// Inserts any necessary instructions at position \p Pos relative to
341   /// instruction \p MI to ensure any subsequent memory instructions of this
342   /// thread with address spaces \p AddrSpace will observe the previous memory
343   /// operations by any thread for memory scopes up to memory scope \p Scope.
344   /// Returns true iff any instructions are inserted.
345   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              Position Pos) const = 0;
349 
350   /// Inserts any necessary instructions at position \p Pos relative to
351   /// instruction \p MI to ensure previous memory instructions by this thread
352   /// with address spaces \p AddrSpace have completed and can be observed by
353   /// subsequent memory instructions by any thread executing in memory scope \p
354   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
355   /// between address spaces. Returns true iff any instructions are inserted.
356   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
357                              SIAtomicScope Scope,
358                              SIAtomicAddrSpace AddrSpace,
359                              bool IsCrossAddrSpaceOrdering,
360                              Position Pos) const = 0;
361 
362   /// Virtual destructor to allow derivations to be deleted.
363   virtual ~SICacheControl() = default;
364 
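  /// Hook to force the SC0/SC1 cache-policy bits on a store when the subtarget
  /// requires it (see the GFX940 override). The default implementation does
  /// nothing and returns false; returns true iff \p MI is modified.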
365   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
366                                    MachineBasicBlock::iterator &MI) const {
367     return false;
368   }
369 };
370 
371 class SIGfx6CacheControl : public SICacheControl {
372 protected:
373 
374   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
375   /// is modified, false otherwise.
376   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
377     return enableNamedBit(MI, AMDGPU::CPol::GLC);
378   }
379 
380   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
381   /// is modified, false otherwise.
382   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
383     return enableNamedBit(MI, AMDGPU::CPol::SLC);
384   }
385 
386 public:
387 
388   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
389 
390   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
391                              SIAtomicScope Scope,
392                              SIAtomicAddrSpace AddrSpace) const override;
393 
394   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
395                               SIAtomicScope Scope,
396                               SIAtomicAddrSpace AddrSpace) const override;
397 
398   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
399                             SIAtomicScope Scope,
400                             SIAtomicAddrSpace AddrSpace) const override;
401 
402   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
403                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
404                                       bool IsVolatile, bool IsNonTemporal,
405                                       bool IsLastUse) const override;
406 
407   bool insertWait(MachineBasicBlock::iterator &MI,
408                   SIAtomicScope Scope,
409                   SIAtomicAddrSpace AddrSpace,
410                   SIMemOp Op,
411                   bool IsCrossAddrSpaceOrdering,
412                   Position Pos) const override;
413 
414   bool insertAcquire(MachineBasicBlock::iterator &MI,
415                      SIAtomicScope Scope,
416                      SIAtomicAddrSpace AddrSpace,
417                      Position Pos) const override;
418 
419   bool insertRelease(MachineBasicBlock::iterator &MI,
420                      SIAtomicScope Scope,
421                      SIAtomicAddrSpace AddrSpace,
422                      bool IsCrossAddrSpaceOrdering,
423                      Position Pos) const override;
424 };
425 
426 class SIGfx7CacheControl : public SIGfx6CacheControl {
427 public:
428 
429   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
430 
431   bool insertAcquire(MachineBasicBlock::iterator &MI,
432                      SIAtomicScope Scope,
433                      SIAtomicAddrSpace AddrSpace,
434                      Position Pos) const override;
435 
436 };
437 
438 class SIGfx90ACacheControl : public SIGfx7CacheControl {
439 public:
440 
441   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
442 
443   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
444                              SIAtomicScope Scope,
445                              SIAtomicAddrSpace AddrSpace) const override;
446 
447   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
448                               SIAtomicScope Scope,
449                               SIAtomicAddrSpace AddrSpace) const override;
450 
451   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
452                             SIAtomicScope Scope,
453                             SIAtomicAddrSpace AddrSpace) const override;
454 
455   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
456                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
457                                       bool IsVolatile, bool IsNonTemporal,
458                                       bool IsLastUse) const override;
459 
460   bool insertWait(MachineBasicBlock::iterator &MI,
461                   SIAtomicScope Scope,
462                   SIAtomicAddrSpace AddrSpace,
463                   SIMemOp Op,
464                   bool IsCrossAddrSpaceOrdering,
465                   Position Pos) const override;
466 
467   bool insertAcquire(MachineBasicBlock::iterator &MI,
468                      SIAtomicScope Scope,
469                      SIAtomicAddrSpace AddrSpace,
470                      Position Pos) const override;
471 
472   bool insertRelease(MachineBasicBlock::iterator &MI,
473                      SIAtomicScope Scope,
474                      SIAtomicAddrSpace AddrSpace,
475                      bool IsCrossAddrSpaceOrdering,
476                      Position Pos) const override;
477 };
478 
479 class SIGfx940CacheControl : public SIGfx90ACacheControl {
480 protected:
481 
482   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
483   /// is modified, false otherwise.
484   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
485     return enableNamedBit(MI, AMDGPU::CPol::SC0);
486   }
487 
488   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
489   /// is modified, false otherwise.
490   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
491     return enableNamedBit(MI, AMDGPU::CPol::SC1);
492   }
493 
494   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
495   /// is modified, false otherwise.
496   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
497     return enableNamedBit(MI, AMDGPU::CPol::NT);
498   }
499 
500 public:
501 
502   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
503 
504   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
505                              SIAtomicScope Scope,
506                              SIAtomicAddrSpace AddrSpace) const override;
507 
508   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
509                               SIAtomicScope Scope,
510                               SIAtomicAddrSpace AddrSpace) const override;
511 
512   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
513                             SIAtomicScope Scope,
514                             SIAtomicAddrSpace AddrSpace) const override;
515 
516   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
517                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
518                                       bool IsVolatile, bool IsNonTemporal,
519                                       bool IsLastUse) const override;
520 
521   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
522                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
523 
524   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
525                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
526                      Position Pos) const override;
527 
528   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
529                            MachineBasicBlock::iterator &MI) const override {
530     bool Changed = false;
531     if (ST.hasForceStoreSC0SC1() &&
532         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
533                                     SIAtomicAddrSpace::GLOBAL |
534                                     SIAtomicAddrSpace::OTHER)) !=
535          SIAtomicAddrSpace::NONE) {
536       Changed |= enableSC0Bit(MI);
537       Changed |= enableSC1Bit(MI);
538     }
539     return Changed;
540   }
541 };
542 
543 class SIGfx10CacheControl : public SIGfx7CacheControl {
544 protected:
545 
546   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
547   /// is modified, false otherwise.
548   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
549     return enableNamedBit(MI, AMDGPU::CPol::DLC);
550   }
551 
552 public:
553 
554   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
555 
556   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
557                              SIAtomicScope Scope,
558                              SIAtomicAddrSpace AddrSpace) const override;
559 
560   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
561                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
562                                       bool IsVolatile, bool IsNonTemporal,
563                                       bool IsLastUse) const override;
564 
565   bool insertWait(MachineBasicBlock::iterator &MI,
566                   SIAtomicScope Scope,
567                   SIAtomicAddrSpace AddrSpace,
568                   SIMemOp Op,
569                   bool IsCrossAddrSpaceOrdering,
570                   Position Pos) const override;
571 
572   bool insertAcquire(MachineBasicBlock::iterator &MI,
573                      SIAtomicScope Scope,
574                      SIAtomicAddrSpace AddrSpace,
575                      Position Pos) const override;
576 };
577 
578 class SIGfx11CacheControl : public SIGfx10CacheControl {
579 public:
580   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
581 
582   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
583                              SIAtomicScope Scope,
584                              SIAtomicAddrSpace AddrSpace) const override;
585 
586   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
587                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
588                                       bool IsVolatile, bool IsNonTemporal,
589                                       bool IsLastUse) const override;
590 };
591 
592 class SIGfx12CacheControl : public SIGfx11CacheControl {
593 protected:
594   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
595   // \returns True if \p MI is modified, false otherwise.
596   bool setTH(const MachineBasicBlock::iterator MI,
597              AMDGPU::CPol::CPol Value) const;
598   // Sets Scope policy to \p Value if CPol operand is present in instruction \p
599   // MI. \returns True if \p MI is modified, false otherwise.
600   bool setScope(const MachineBasicBlock::iterator MI,
601                 AMDGPU::CPol::CPol Value) const;
602 
603   // Stores with system scope (SCOPE_SYS) need to wait for:
604   // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
605   // - non-returning-atomics       - wait for STORECNT==0
606   //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
607   //   since it does not distinguish atomics-with-return from regular stores.
608   // There is no need to wait if memory is cached (mtype != UC).
609   bool
610   insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
611 
612 public:
613   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
614 
615   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
616                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
617                   bool IsCrossAddrSpaceOrdering, Position Pos) const override;
618 
619   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
620                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
621 
622   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
623                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
624                                       bool IsVolatile, bool IsNonTemporal,
625                                       bool IsLastUse) const override;
626 
627   bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
628 };
629 
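/// Machine function pass that implements the memory model: for every atomic or
/// volatile/nontemporal memory operation it gathers a SIMemOpInfo and uses the
/// subtarget's SICacheControl to insert the required waits, cache
/// invalidations, and cache-policy bits.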
630 class SIMemoryLegalizer final : public MachineFunctionPass {
631 private:
632 
633   /// Cache Control.
634   std::unique_ptr<SICacheControl> CC = nullptr;
635 
636   /// List of atomic pseudo instructions.
637   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
638 
639   /// Return true iff instruction \p MI is an atomic instruction that
640   /// returns a result.
641   bool isAtomicRet(const MachineInstr &MI) const {
642     return SIInstrInfo::isAtomicRet(MI);
643   }
644 
645   /// Removes all processed atomic pseudo instructions from the current
646   /// function. Returns true if the function is modified, false otherwise.
647   bool removeAtomicPseudoMIs();
648 
649   /// Expands load operation \p MI. Returns true if instructions are
650   /// added/deleted or \p MI is modified, false otherwise.
651   bool expandLoad(const SIMemOpInfo &MOI,
652                   MachineBasicBlock::iterator &MI);
653   /// Expands store operation \p MI. Returns true if instructions are
654   /// added/deleted or \p MI is modified, false otherwise.
655   bool expandStore(const SIMemOpInfo &MOI,
656                    MachineBasicBlock::iterator &MI);
657   /// Expands atomic fence operation \p MI. Returns true if
658   /// instructions are added/deleted or \p MI is modified, false otherwise.
659   bool expandAtomicFence(const SIMemOpInfo &MOI,
660                          MachineBasicBlock::iterator &MI);
661   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
662   /// instructions are added/deleted or \p MI is modified, false otherwise.
663   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
664                                 MachineBasicBlock::iterator &MI);
665 
666 public:
667   static char ID;
668 
669   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
670 
671   void getAnalysisUsage(AnalysisUsage &AU) const override {
672     AU.setPreservesCFG();
673     MachineFunctionPass::getAnalysisUsage(AU);
674   }
675 
676   StringRef getPassName() const override {
677     return PASS_NAME;
678   }
679 
680   bool runOnMachineFunction(MachineFunction &MF) override;
681 };
682 
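/// Address space names accepted in "amdgpu-as" MMRAs, mapped to the address
/// spaces they denote.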
683 static const StringMap<SIAtomicAddrSpace> ASNames = {{
684     {"global", SIAtomicAddrSpace::GLOBAL},
685     {"local", SIAtomicAddrSpace::LDS},
686 }};
687 
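/// Emits a warning listing the accepted address space names when an
/// "amdgpu-as" MMRA on \p MI names an unknown address space \p AS.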
688 void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
689   const MachineFunction *MF = MI.getMF();
690   const Function &Fn = MF->getFunction();
691   SmallString<128> Str;
692   raw_svector_ostream OS(Str);
693   OS << "unknown address space '" << AS << "'; expected one of ";
694   ListSeparator LS;
695   for (const auto &[Name, Val] : ASNames)
696     OS << LS << '\'' << Name << '\'';
697   DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
698   Fn.getContext().diagnose(BadTag);
699 }
700 
701 /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
702 /// If this tag isn't present, or if it has no meaningful values, returns \p
703 /// Default. Otherwise returns all the address spaces concerned by the MMRA.
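/// For example, a fence carrying the MMRA ("amdgpu-as", "local") is narrowed
/// to only the LDS address space.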
704 static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
705                                                SIAtomicAddrSpace Default) {
706   static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
707 
708   auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
709   if (!MMRA)
710     return Default;
711 
712   SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
713   for (const auto &[Prefix, Suffix] : MMRA) {
714     if (Prefix != FenceASPrefix)
715       continue;
716 
717     if (auto It = ASNames.find(Suffix); It != ASNames.end())
718       Result |= It->second;
719     else
720       diagnoseUnknownMMRAASName(MI, Suffix);
721   }
722 
723   return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
724 }
725 
726 } // end anonymous namespace
727 
728 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
729                                       const char *Msg) const {
730   const Function &Func = MI->getParent()->getParent()->getFunction();
731   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
732   Func.getContext().diagnose(Diag);
733 }
734 
735 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
736 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
737                                SIAtomicAddrSpace InstrAddrSpace) const {
738   if (SSID == SyncScope::System)
739     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
740   if (SSID == MMI->getAgentSSID())
741     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
742   if (SSID == MMI->getWorkgroupSSID())
743     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
744                       true);
745   if (SSID == MMI->getWavefrontSSID())
746     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
747                       true);
748   if (SSID == SyncScope::SingleThread)
749     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
750                       true);
751   if (SSID == MMI->getSystemOneAddressSpaceSSID())
752     return std::tuple(SIAtomicScope::SYSTEM,
753                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
754   if (SSID == MMI->getAgentOneAddressSpaceSSID())
755     return std::tuple(SIAtomicScope::AGENT,
756                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
757   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
758     return std::tuple(SIAtomicScope::WORKGROUP,
759                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
760   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
761     return std::tuple(SIAtomicScope::WAVEFRONT,
762                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
763   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
764     return std::tuple(SIAtomicScope::SINGLETHREAD,
765                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
766   return std::nullopt;
767 }
768 
769 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
770   if (AS == AMDGPUAS::FLAT_ADDRESS)
771     return SIAtomicAddrSpace::FLAT;
772   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
773     return SIAtomicAddrSpace::GLOBAL;
774   if (AS == AMDGPUAS::LOCAL_ADDRESS)
775     return SIAtomicAddrSpace::LDS;
776   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
777     return SIAtomicAddrSpace::SCRATCH;
778   if (AS == AMDGPUAS::REGION_ADDRESS)
779     return SIAtomicAddrSpace::GDS;
780 
781   return SIAtomicAddrSpace::OTHER;
782 }
783 
784 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
785   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
786 }
787 
788 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
789     const MachineBasicBlock::iterator &MI) const {
790   assert(MI->getNumMemOperands() > 0);
791 
792   SyncScope::ID SSID = SyncScope::SingleThread;
793   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
794   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
795   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
796   bool IsNonTemporal = true;
797   bool IsVolatile = false;
798   bool IsLastUse = false;
799 
800   // Validator should check whether or not MMOs cover the entire set of
801   // locations accessed by the memory instruction.
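  // Merge the attributes of all memory operands: the access is nontemporal
  // only if every operand is, volatile or last-use if any operand is, and the
  // strongest ordering and widest synchronization scope seen are kept.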
802   for (const auto &MMO : MI->memoperands()) {
803     IsNonTemporal &= MMO->isNonTemporal();
804     IsVolatile |= MMO->isVolatile();
805     IsLastUse |= MMO->getFlags() & MOLastUse;
806     InstrAddrSpace |=
807       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
808     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
809     if (OpOrdering != AtomicOrdering::NotAtomic) {
810       const auto &IsSyncScopeInclusion =
811           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
812       if (!IsSyncScopeInclusion) {
813         reportUnsupported(MI,
814           "Unsupported non-inclusive atomic synchronization scope");
815         return std::nullopt;
816       }
817 
818       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
819       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
820       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
821              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
822       FailureOrdering =
823           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
824     }
825   }
826 
827   SIAtomicScope Scope = SIAtomicScope::NONE;
828   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
829   bool IsCrossAddressSpaceOrdering = false;
830   if (Ordering != AtomicOrdering::NotAtomic) {
831     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
832     if (!ScopeOrNone) {
833       reportUnsupported(MI, "Unsupported atomic synchronization scope");
834       return std::nullopt;
835     }
836     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
837         *ScopeOrNone;
838     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
839         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
840         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
841       reportUnsupported(MI, "Unsupported atomic address space");
842       return std::nullopt;
843     }
844   }
845   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
846                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
847                      IsNonTemporal, IsLastUse);
848 }
849 
850 std::optional<SIMemOpInfo>
851 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
852   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
853 
854   if (!(MI->mayLoad() && !MI->mayStore()))
855     return std::nullopt;
856 
857   // Be conservative if there are no memory operands.
858   if (MI->getNumMemOperands() == 0)
859     return SIMemOpInfo();
860 
861   return constructFromMIWithMMO(MI);
862 }
863 
864 std::optional<SIMemOpInfo>
865 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
866   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
867 
868   if (!(!MI->mayLoad() && MI->mayStore()))
869     return std::nullopt;
870 
871   // Be conservative if there are no memory operands.
872   if (MI->getNumMemOperands() == 0)
873     return SIMemOpInfo();
874 
875   return constructFromMIWithMMO(MI);
876 }
877 
878 std::optional<SIMemOpInfo>
879 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
880   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
881 
882   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
883     return std::nullopt;
884 
885   AtomicOrdering Ordering =
886     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
887 
888   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
889   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
890   if (!ScopeOrNone) {
891     reportUnsupported(MI, "Unsupported atomic synchronization scope");
892     return std::nullopt;
893   }
894 
895   SIAtomicScope Scope = SIAtomicScope::NONE;
896   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
897   bool IsCrossAddressSpaceOrdering = false;
898   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
899       *ScopeOrNone;
900 
901   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
902       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
903     reportUnsupported(MI, "Unsupported atomic address space");
904     return std::nullopt;
905   }
906 
907   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
908                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
909 }
910 
911 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
912     const MachineBasicBlock::iterator &MI) const {
913   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
914 
915   if (!(MI->mayLoad() && MI->mayStore()))
916     return std::nullopt;
917 
918   // Be conservative if there are no memory operands.
919   if (MI->getNumMemOperands() == 0)
920     return SIMemOpInfo();
921 
922   return constructFromMIWithMMO(MI);
923 }
924 
925 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
926   TII = ST.getInstrInfo();
927   IV = getIsaVersion(ST.getCPU());
928   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
929 }
930 
931 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
932                                     AMDGPU::CPol::CPol Bit) const {
933   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
934   if (!CPol)
935     return false;
936 
937   CPol->setImm(CPol->getImm() | Bit);
938   return true;
939 }
940 
941 /* static */
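// gfx90a and gfx940 are GFX9-generation subtargets, so their feature flags are
// checked before the generation-based selection below.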
942 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
943   GCNSubtarget::Generation Generation = ST.getGeneration();
944   if (ST.hasGFX940Insts())
945     return std::make_unique<SIGfx940CacheControl>(ST);
946   if (ST.hasGFX90AInsts())
947     return std::make_unique<SIGfx90ACacheControl>(ST);
948   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
949     return std::make_unique<SIGfx6CacheControl>(ST);
950   if (Generation < AMDGPUSubtarget::GFX10)
951     return std::make_unique<SIGfx7CacheControl>(ST);
952   if (Generation < AMDGPUSubtarget::GFX11)
953     return std::make_unique<SIGfx10CacheControl>(ST);
954   if (Generation < AMDGPUSubtarget::GFX12)
955     return std::make_unique<SIGfx11CacheControl>(ST);
956   return std::make_unique<SIGfx12CacheControl>(ST);
957 }
958 
959 bool SIGfx6CacheControl::enableLoadCacheBypass(
960     const MachineBasicBlock::iterator &MI,
961     SIAtomicScope Scope,
962     SIAtomicAddrSpace AddrSpace) const {
963   assert(MI->mayLoad() && !MI->mayStore());
964   bool Changed = false;
965 
966   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
967     switch (Scope) {
968     case SIAtomicScope::SYSTEM:
969     case SIAtomicScope::AGENT:
970       // Set L1 cache policy to MISS_EVICT.
971       // Note: there is no L2 cache bypass policy at the ISA level.
972       Changed |= enableGLCBit(MI);
973       break;
974     case SIAtomicScope::WORKGROUP:
975     case SIAtomicScope::WAVEFRONT:
976     case SIAtomicScope::SINGLETHREAD:
977       // No cache to bypass.
978       break;
979     default:
980       llvm_unreachable("Unsupported synchronization scope");
981     }
982   }
983 
984   /// The scratch address space does not need the global memory caches
985   /// to be bypassed as all memory operations by the same thread are
986   /// sequentially consistent, and no other thread can access scratch
987   /// memory.
988 
989   /// Other address spaces do not have a cache.
990 
991   return Changed;
992 }
993 
994 bool SIGfx6CacheControl::enableStoreCacheBypass(
995     const MachineBasicBlock::iterator &MI,
996     SIAtomicScope Scope,
997     SIAtomicAddrSpace AddrSpace) const {
998   assert(!MI->mayLoad() && MI->mayStore());
999   bool Changed = false;
1000 
1001   /// The L1 cache is write-through, so it does not need to be bypassed. There is
1002   /// no bypass control for the L2 cache at the ISA level.
1003 
1004   return Changed;
1005 }
1006 
1007 bool SIGfx6CacheControl::enableRMWCacheBypass(
1008     const MachineBasicBlock::iterator &MI,
1009     SIAtomicScope Scope,
1010     SIAtomicAddrSpace AddrSpace) const {
1011   assert(MI->mayLoad() && MI->mayStore());
1012   bool Changed = false;
1013 
1014   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1015   /// bypassed, and the GLC bit is instead used to indicate if they are
1016   /// return or no-return.
1017   /// Note: there is no L2 cache coherent bypass control at the ISA level.
1018 
1019   return Changed;
1020 }
1021 
1022 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1023     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1024     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1025   // Only handle load and store, not atomic read-modify-write instructions. The
1026   // latter use glc to indicate if the atomic returns a result and so must not
1027   // be used for cache control.
1028   assert(MI->mayLoad() ^ MI->mayStore());
1029 
1030   // Only update load and store, not LLVM IR atomic read-modify-write
1031   // instructions. The latter are always marked as volatile, so they cannot be
1032   // handled here without pessimizing all atomics. They also do not support the
1033   // nontemporal attribute.
1034   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1035 
1036   bool Changed = false;
1037 
1038   if (IsVolatile) {
1039     // Set L1 cache policy to be MISS_EVICT for load instructions
1040     // and MISS_LRU for store instructions.
1041     // Note: there is no L2 cache bypass policy at the ISA level.
1042     if (Op == SIMemOp::LOAD)
1043       Changed |= enableGLCBit(MI);
1044 
1045     // Ensure operation has completed at system scope to cause all volatile
1046     // operations to be visible outside the program in a global order. Do not
1047     // request cross address space as only the global address space can be
1048     // observable outside the program, so no need to cause a waitcnt for LDS
1049     // address space operations.
1050     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1051                           Position::AFTER);
1052 
1053     return Changed;
1054   }
1055 
1056   if (IsNonTemporal) {
1057     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1058     // for both loads and stores, and the L2 cache policy to STREAM.
1059     Changed |= enableGLCBit(MI);
1060     Changed |= enableSLCBit(MI);
1061     return Changed;
1062   }
1063 
1064   return Changed;
1065 }
1066 
1067 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1068                                     SIAtomicScope Scope,
1069                                     SIAtomicAddrSpace AddrSpace,
1070                                     SIMemOp Op,
1071                                     bool IsCrossAddrSpaceOrdering,
1072                                     Position Pos) const {
1073   bool Changed = false;
1074 
1075   MachineBasicBlock &MBB = *MI->getParent();
1076   DebugLoc DL = MI->getDebugLoc();
1077 
1078   if (Pos == Position::AFTER)
1079     ++MI;
1080 
1081   bool VMCnt = false;
1082   bool LGKMCnt = false;
1083 
1084   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1085       SIAtomicAddrSpace::NONE) {
1086     switch (Scope) {
1087     case SIAtomicScope::SYSTEM:
1088     case SIAtomicScope::AGENT:
1089       VMCnt |= true;
1090       break;
1091     case SIAtomicScope::WORKGROUP:
1092     case SIAtomicScope::WAVEFRONT:
1093     case SIAtomicScope::SINGLETHREAD:
1094       // The L1 cache keeps all memory operations in order for
1095       // wavefronts in the same work-group.
1096       break;
1097     default:
1098       llvm_unreachable("Unsupported synchronization scope");
1099     }
1100   }
1101 
1102   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1103     switch (Scope) {
1104     case SIAtomicScope::SYSTEM:
1105     case SIAtomicScope::AGENT:
1106     case SIAtomicScope::WORKGROUP:
1107       // If there is no cross address space ordering, an "S_WAITCNT lgkmcnt(0)" is
1108       // not needed as LDS operations for all waves are executed in a total
1109       // global ordering as observed by all waves. Required if also
1110       // synchronizing with global/GDS memory as LDS operations could be
1111       // reordered with respect to later global/GDS memory operations of the
1112       // same wave.
1113       LGKMCnt |= IsCrossAddrSpaceOrdering;
1114       break;
1115     case SIAtomicScope::WAVEFRONT:
1116     case SIAtomicScope::SINGLETHREAD:
1117       // The LDS keeps all memory operations in order for
1118       // the same wavefront.
1119       break;
1120     default:
1121       llvm_unreachable("Unsupported synchronization scope");
1122     }
1123   }
1124 
1125   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1126     switch (Scope) {
1127     case SIAtomicScope::SYSTEM:
1128     case SIAtomicScope::AGENT:
1129       // If there is no cross address space ordering, a GDS "S_WAITCNT lgkmcnt(0)"
1130       // is not needed as GDS operations for all waves are executed in a total
1131       // global ordering as observed by all waves. Required if also
1132       // synchronizing with global/LDS memory as GDS operations could be
1133       // reordered with respect to later global/LDS memory operations of the
1134       // same wave.
1135       LGKMCnt |= IsCrossAddrSpaceOrdering;
1136       break;
1137     case SIAtomicScope::WORKGROUP:
1138     case SIAtomicScope::WAVEFRONT:
1139     case SIAtomicScope::SINGLETHREAD:
1140       // The GDS keeps all memory operations in order for
1141       // the same work-group.
1142       break;
1143     default:
1144       llvm_unreachable("Unsupported synchronization scope");
1145     }
1146   }
1147 
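  // Emit a single soft waitcnt that waits for the selected counters to reach
  // zero; counters that are not being waited on are encoded with their maximum
  // (no-wait) values.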
1148   if (VMCnt || LGKMCnt) {
1149     unsigned WaitCntImmediate =
1150       AMDGPU::encodeWaitcnt(IV,
1151                             VMCnt ? 0 : getVmcntBitMask(IV),
1152                             getExpcntBitMask(IV),
1153                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1154     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1155         .addImm(WaitCntImmediate);
1156     Changed = true;
1157   }
1158 
1159   if (Pos == Position::AFTER)
1160     --MI;
1161 
1162   return Changed;
1163 }
1164 
1165 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1166                                        SIAtomicScope Scope,
1167                                        SIAtomicAddrSpace AddrSpace,
1168                                        Position Pos) const {
1169   if (!InsertCacheInv)
1170     return false;
1171 
1172   bool Changed = false;
1173 
1174   MachineBasicBlock &MBB = *MI->getParent();
1175   DebugLoc DL = MI->getDebugLoc();
1176 
1177   if (Pos == Position::AFTER)
1178     ++MI;
1179 
1180   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1181     switch (Scope) {
1182     case SIAtomicScope::SYSTEM:
1183     case SIAtomicScope::AGENT:
1184       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1185       Changed = true;
1186       break;
1187     case SIAtomicScope::WORKGROUP:
1188     case SIAtomicScope::WAVEFRONT:
1189     case SIAtomicScope::SINGLETHREAD:
1190       // No cache to invalidate.
1191       break;
1192     default:
1193       llvm_unreachable("Unsupported synchronization scope");
1194     }
1195   }
1196 
1197   /// The scratch address space does not need the global memory cache
1198   /// to be flushed as all memory operations by the same thread are
1199   /// sequentially consistent, and no other thread can access scratch
1200   /// memory.
1201 
1202   /// Other address spaces do not have a cache.
1203 
1204   if (Pos == Position::AFTER)
1205     --MI;
1206 
1207   return Changed;
1208 }
1209 
1210 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1211                                        SIAtomicScope Scope,
1212                                        SIAtomicAddrSpace AddrSpace,
1213                                        bool IsCrossAddrSpaceOrdering,
1214                                        Position Pos) const {
1215   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1216                     IsCrossAddrSpaceOrdering, Pos);
1217 }
1218 
1219 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1220                                        SIAtomicScope Scope,
1221                                        SIAtomicAddrSpace AddrSpace,
1222                                        Position Pos) const {
1223   if (!InsertCacheInv)
1224     return false;
1225 
1226   bool Changed = false;
1227 
1228   MachineBasicBlock &MBB = *MI->getParent();
1229   DebugLoc DL = MI->getDebugLoc();
1230 
1231   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1232 
1233   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1234                                     ? AMDGPU::BUFFER_WBINVL1
1235                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1236 
1237   if (Pos == Position::AFTER)
1238     ++MI;
1239 
1240   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1241     switch (Scope) {
1242     case SIAtomicScope::SYSTEM:
1243     case SIAtomicScope::AGENT:
1244       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1245       Changed = true;
1246       break;
1247     case SIAtomicScope::WORKGROUP:
1248     case SIAtomicScope::WAVEFRONT:
1249     case SIAtomicScope::SINGLETHREAD:
1250       // No cache to invalidate.
1251       break;
1252     default:
1253       llvm_unreachable("Unsupported synchronization scope");
1254     }
1255   }
1256 
1257   /// The scratch address space does not need the global memory cache
1258   /// to be flushed as all memory operations by the same thread are
1259   /// sequentially consistent, and no other thread can access scratch
1260   /// memory.
1261 
1262   /// Other address spaces do not have a cache.
1263 
1264   if (Pos == Position::AFTER)
1265     --MI;
1266 
1267   return Changed;
1268 }
1269 
1270 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1271     const MachineBasicBlock::iterator &MI,
1272     SIAtomicScope Scope,
1273     SIAtomicAddrSpace AddrSpace) const {
1274   assert(MI->mayLoad() && !MI->mayStore());
1275   bool Changed = false;
1276 
1277   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1278     switch (Scope) {
1279     case SIAtomicScope::SYSTEM:
1280     case SIAtomicScope::AGENT:
1281       // Set the L1 cache policy to MISS_LRU.
1282       // Note: there is no L2 cache bypass policy at the ISA level.
1283       Changed |= enableGLCBit(MI);
1284       break;
1285     case SIAtomicScope::WORKGROUP:
1286       // In threadgroup split mode the waves of a work-group can be executing on
1287       // different CUs. Therefore the L1, which is per CU, needs to be bypassed.
1288       // Otherwise in non-threadgroup split mode all waves of a work-group are
1289       // on the same CU, and so the L1 does not need to be bypassed.
1290       if (ST.isTgSplitEnabled())
1291         Changed |= enableGLCBit(MI);
1292       break;
1293     case SIAtomicScope::WAVEFRONT:
1294     case SIAtomicScope::SINGLETHREAD:
1295       // No cache to bypass.
1296       break;
1297     default:
1298       llvm_unreachable("Unsupported synchronization scope");
1299     }
1300   }
1301 
1302   /// The scratch address space does not need the global memory caches
1303   /// to be bypassed as all memory operations by the same thread are
1304   /// sequentially consistent, and no other thread can access scratch
1305   /// memory.
1306 
1307   /// Other address spaces do not have a cache.
1308 
1309   return Changed;
1310 }
1311 
1312 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1313     const MachineBasicBlock::iterator &MI,
1314     SIAtomicScope Scope,
1315     SIAtomicAddrSpace AddrSpace) const {
1316   assert(!MI->mayLoad() && MI->mayStore());
1317   bool Changed = false;
1318 
1319   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1320     switch (Scope) {
1321     case SIAtomicScope::SYSTEM:
1322     case SIAtomicScope::AGENT:
1323       /// Do not set glc for store atomic operations as they implicitly write
1324       /// through the L1 cache.
1325       break;
1326     case SIAtomicScope::WORKGROUP:
1327     case SIAtomicScope::WAVEFRONT:
1328     case SIAtomicScope::SINGLETHREAD:
1329       // No cache to bypass. Store atomics implicitly write through the L1
1330       // cache.
1331       break;
1332     default:
1333       llvm_unreachable("Unsupported synchronization scope");
1334     }
1335   }
1336 
1337   /// The scratch address space does not need the global memory caches
1338   /// to be bypassed as all memory operations by the same thread are
1339   /// sequentially consistent, and no other thread can access scratch
1340   /// memory.
1341 
1342   /// Other address spaces do not have a cache.
1343 
1344   return Changed;
1345 }
1346 
1347 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1348     const MachineBasicBlock::iterator &MI,
1349     SIAtomicScope Scope,
1350     SIAtomicAddrSpace AddrSpace) const {
1351   assert(MI->mayLoad() && MI->mayStore());
1352   bool Changed = false;
1353 
1354   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1355     switch (Scope) {
1356     case SIAtomicScope::SYSTEM:
1357     case SIAtomicScope::AGENT:
1358       /// Do not set glc for RMW atomic operations as they implicitly bypass
1359       /// the L1 cache, and the glc bit is instead used to indicate if they are
1360       /// return or no-return.
1361       break;
1362     case SIAtomicScope::WORKGROUP:
1363     case SIAtomicScope::WAVEFRONT:
1364     case SIAtomicScope::SINGLETHREAD:
1365       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1366       break;
1367     default:
1368       llvm_unreachable("Unsupported synchronization scope");
1369     }
1370   }
1371 
1372   return Changed;
1373 }
1374 
1375 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1376     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1377     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1378   // Only handle load and store, not atomic read-modify-write instructions. The
1379   // latter use glc to indicate if the atomic returns a result and so must not
1380   // be used for cache control.
1381   assert(MI->mayLoad() ^ MI->mayStore());
1382 
1383   // Only update load and store, not LLVM IR atomic read-modify-write
1384   // instructions. The latter are always marked as volatile, so handling them
1385   // here would pessimize all atomics. They also do not support
1386   // the nontemporal attribute.
1387   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1388 
1389   bool Changed = false;
1390 
1391   if (IsVolatile) {
1392     // Set L1 cache policy to be MISS_EVICT for load instructions
1393     // and MISS_LRU for store instructions.
1394     // Note: there is no L2 cache bypass policy at the ISA level.
1395     if (Op == SIMemOp::LOAD)
1396       Changed |= enableGLCBit(MI);
1397 
1398     // Ensure operation has completed at system scope to cause all volatile
1399     // operations to be visible outside the program in a global order. Do not
1400     // request cross address space as only the global address space can be
1401     // observable outside the program, so no need to cause a waitcnt for LDS
1402     // address space operations.
1403     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1404                           Position::AFTER);
1405 
1406     return Changed;
1407   }
1408 
1409   if (IsNonTemporal) {
1410     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1411     // for both loads and stores, and the L2 cache policy to STREAM.
1412     Changed |= enableGLCBit(MI);
1413     Changed |= enableSLCBit(MI);
1414     return Changed;
1415   }
1416 
1417   return Changed;
1418 }
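
// Illustrative only: the GFX90A volatile/nontemporal handling above boils down
// to a small table of cache-policy bits. The standalone sketch below restates
// that mapping with plain bools; Gfx90aBitsSketch and pickGfx90aBitsSketch are
// hypothetical names used purely for illustration (the system-scope wait that
// the real code also inserts for volatile accesses is not covered here).
struct Gfx90aBitsSketch {
  bool GLC; // L1 cache policy bit
  bool SLC; // L2 cache policy bit
};
static Gfx90aBitsSketch pickGfx90aBitsSketch(bool IsVolatile, bool IsNonTemporal,
                                             bool IsLoad) {
  if (IsVolatile)
    return {IsLoad, false}; // volatile loads get GLC (L1 MISS_EVICT); stores keep MISS_LRU
  if (IsNonTemporal)
    return {true, true};    // GLC+SLC: L1 MISS_EVICT, L2 STREAM
  return {false, false};    // default cache policy
}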
1419 
1420 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1421                                       SIAtomicScope Scope,
1422                                       SIAtomicAddrSpace AddrSpace,
1423                                       SIMemOp Op,
1424                                       bool IsCrossAddrSpaceOrdering,
1425                                       Position Pos) const {
1426   if (ST.isTgSplitEnabled()) {
1427     // In threadgroup split mode the waves of a work-group can be executing on
1428     // different CUs. Therefore need to wait for global or GDS memory operations
1429     // to complete to ensure they are visible to waves in the other CUs.
1430     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1431     // the same CU, so no need to wait for global memory as all waves in the
1432     // work-group access the same the L1, nor wait for GDS as access are ordered
1433     // on a CU.
1434     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1435                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1436         (Scope == SIAtomicScope::WORKGROUP)) {
1437       // Same as GFX7 using agent scope.
1438       Scope = SIAtomicScope::AGENT;
1439     }
1440     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1441     // LDS memory operations.
1442     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1443   }
1444   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1445                                         IsCrossAddrSpaceOrdering, Pos);
1446 }
1447 
1448 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1449                                          SIAtomicScope Scope,
1450                                          SIAtomicAddrSpace AddrSpace,
1451                                          Position Pos) const {
1452   if (!InsertCacheInv)
1453     return false;
1454 
1455   bool Changed = false;
1456 
1457   MachineBasicBlock &MBB = *MI->getParent();
1458   DebugLoc DL = MI->getDebugLoc();
1459 
1460   if (Pos == Position::AFTER)
1461     ++MI;
1462 
1463   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1464     switch (Scope) {
1465     case SIAtomicScope::SYSTEM:
1466       // Ensures that following loads will not see stale remote VMEM data or
1467       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1468       // CC will never be stale due to the local memory probes.
1469       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1470       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1471       // hardware does not reorder memory operations by the same wave with
1472       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1473       // remove any cache lines of earlier writes by the same wave and ensures
1474       // later reads by the same wave will refetch the cache lines.
1475       Changed = true;
1476       break;
1477     case SIAtomicScope::AGENT:
1478       // Same as GFX7.
1479       break;
1480     case SIAtomicScope::WORKGROUP:
1481       // In threadgroup split mode the waves of a work-group can be executing on
1482       // different CUs. Therefore need to invalidate the L1 which is per CU.
1483       // Otherwise in non-threadgroup split mode all waves of a work-group are
1484       // on the same CU, and so the L1 does not need to be invalidated.
1485       if (ST.isTgSplitEnabled()) {
1486         // Same as GFX7 using agent scope.
1487         Scope = SIAtomicScope::AGENT;
1488       }
1489       break;
1490     case SIAtomicScope::WAVEFRONT:
1491     case SIAtomicScope::SINGLETHREAD:
1492       // Same as GFX7.
1493       break;
1494     default:
1495       llvm_unreachable("Unsupported synchronization scope");
1496     }
1497   }
1498 
1499   /// The scratch address space does not need the global memory cache
1500   /// to be flushed as all memory operations by the same thread are
1501   /// sequentially consistent, and no other thread can access scratch
1502   /// memory.
1503 
1504   /// Other address spaces do not have a cache.
1505 
1506   if (Pos == Position::AFTER)
1507     --MI;
1508 
1509   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1510 
1511   return Changed;
1512 }
1513 
1514 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1515                                          SIAtomicScope Scope,
1516                                          SIAtomicAddrSpace AddrSpace,
1517                                          bool IsCrossAddrSpaceOrdering,
1518                                          Position Pos) const {
1519   bool Changed = false;
1520 
1521   MachineBasicBlock &MBB = *MI->getParent();
1522   const DebugLoc &DL = MI->getDebugLoc();
1523 
1524   if (Pos == Position::AFTER)
1525     ++MI;
1526 
1527   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1528     switch (Scope) {
1529     case SIAtomicScope::SYSTEM:
1530       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1531       // hardware does not reorder memory operations by the same wave with
1532       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1533       // to initiate writeback of any dirty cache lines of earlier writes by the
1534       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1535       // writeback has completed.
1536       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1537         // Set SC bits to indicate system scope.
1538         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1539       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1540       // vmcnt(0)" needed by the "BUFFER_WBL2".
1541       Changed = true;
1542       break;
1543     case SIAtomicScope::AGENT:
1544     case SIAtomicScope::WORKGROUP:
1545     case SIAtomicScope::WAVEFRONT:
1546     case SIAtomicScope::SINGLETHREAD:
1547       // Same as GFX7.
1548       break;
1549     default:
1550       llvm_unreachable("Unsupported synchronization scope");
1551     }
1552   }
1553 
1554   if (Pos == Position::AFTER)
1555     --MI;
1556 
1557   Changed |=
1558       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1559                                         IsCrossAddrSpaceOrdering, Pos);
1560 
1561   return Changed;
1562 }
1563 
1564 bool SIGfx940CacheControl::enableLoadCacheBypass(
1565     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1566     SIAtomicAddrSpace AddrSpace) const {
1567   assert(MI->mayLoad() && !MI->mayStore());
1568   bool Changed = false;
1569 
1570   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1571     switch (Scope) {
1572     case SIAtomicScope::SYSTEM:
1573       // Set SC bits to indicate system scope.
1574       Changed |= enableSC0Bit(MI);
1575       Changed |= enableSC1Bit(MI);
1576       break;
1577     case SIAtomicScope::AGENT:
1578       // Set SC bits to indicate agent scope.
1579       Changed |= enableSC1Bit(MI);
1580       break;
1581     case SIAtomicScope::WORKGROUP:
1582       // In threadgroup split mode the waves of a work-group can be executing on
1583       // different CUs. Therefore need to bypass the L1 which is per CU.
1584       // Otherwise in non-threadgroup split mode all waves of a work-group are
1585       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1586       // bits to indicate work-group scope will do this automatically.
1587       Changed |= enableSC0Bit(MI);
1588       break;
1589     case SIAtomicScope::WAVEFRONT:
1590     case SIAtomicScope::SINGLETHREAD:
1591       // Leave SC bits unset to indicate wavefront scope.
1592       break;
1593     default:
1594       llvm_unreachable("Unsupported synchronization scope");
1595     }
1596   }
1597 
1598   /// The scratch address space does not need the global memory caches
1599   /// to be bypassed as all memory operations by the same thread are
1600   /// sequentially consistent, and no other thread can access scratch
1601   /// memory.
1602 
1603   /// Other address spaces do not have a cache.
1604 
1605   return Changed;
1606 }
1607 
1608 bool SIGfx940CacheControl::enableStoreCacheBypass(
1609     const MachineBasicBlock::iterator &MI,
1610     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1611   assert(!MI->mayLoad() && MI->mayStore());
1612   bool Changed = false;
1613 
1614   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1615     switch (Scope) {
1616     case SIAtomicScope::SYSTEM:
1617       // Set SC bits to indicate system scope.
1618       Changed |= enableSC0Bit(MI);
1619       Changed |= enableSC1Bit(MI);
1620       break;
1621     case SIAtomicScope::AGENT:
1622       // Set SC bits to indicate agent scope.
1623       Changed |= enableSC1Bit(MI);
1624       break;
1625     case SIAtomicScope::WORKGROUP:
1626       // Set SC bits to indicate workgroup scope.
1627       Changed |= enableSC0Bit(MI);
1628       break;
1629     case SIAtomicScope::WAVEFRONT:
1630     case SIAtomicScope::SINGLETHREAD:
1631       // Leave SC bits unset to indicate wavefront scope.
1632       break;
1633     default:
1634       llvm_unreachable("Unsupported synchronization scope");
1635     }
1636   }
1637 
1638   /// The scratch address space does not need the global memory caches
1639   /// to be bypassed as all memory operations by the same thread are
1640   /// sequentially consistent, and no other thread can access scratch
1641   /// memory.
1642 
1643   /// Other address spaces do not have a cache.
1644 
1645   return Changed;
1646 }
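
// Illustrative only: on GFX940 the cache-bypass scope selected above for loads
// and stores is encoded in the two SC bits. The sketch below restates that
// mapping; SCBitsSketch and scopeToSCBitsSketch are hypothetical names, and RMW
// atomics use the bits differently (see enableRMWCacheBypass below).
struct SCBitsSketch {
  bool SC0;
  bool SC1;
};
static SCBitsSketch scopeToSCBitsSketch(SIAtomicScope Scope) {
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    return {true, true};   // SC0|SC1: system scope
  case SIAtomicScope::AGENT:
    return {false, true};  // SC1: agent scope
  case SIAtomicScope::WORKGROUP:
    return {true, false};  // SC0: work-group scope
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    return {false, false}; // bits unset: wavefront scope
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }
}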
1647 
1648 bool SIGfx940CacheControl::enableRMWCacheBypass(
1649     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1650     SIAtomicAddrSpace AddrSpace) const {
1651   assert(MI->mayLoad() && MI->mayStore());
1652   bool Changed = false;
1653 
1654   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1655     switch (Scope) {
1656     case SIAtomicScope::SYSTEM:
1657       // Set SC1 bit to indicate system scope.
1658       Changed |= enableSC1Bit(MI);
1659       break;
1660     case SIAtomicScope::AGENT:
1661     case SIAtomicScope::WORKGROUP:
1662     case SIAtomicScope::WAVEFRONT:
1663     case SIAtomicScope::SINGLETHREAD:
1664       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1665       // to indicate system or agent scope. The SC0 bit is used to indicate if
1666       // they are return or no-return. Leave SC1 bit unset to indicate agent
1667       // scope.
1668       break;
1669     default:
1670       llvm_unreachable("Unsupported synchronization scope");
1671     }
1672   }
1673 
1674   return Changed;
1675 }
1676 
1677 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1678     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1679     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1680   // Only handle load and store, not atomic read-modify-write instructions. The
1681   // latter use glc to indicate if the atomic returns a result and so must not
1682   // be used for cache control.
1683   assert(MI->mayLoad() ^ MI->mayStore());
1684 
1685   // Only update load and store, not LLVM IR atomic read-modify-write
1686   // instructions. The latter are always marked as volatile, so handling them
1687   // here would pessimize all atomics. They also do not support
1688   // the nontemporal attribute.
1689   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1690 
1691   bool Changed = false;
1692 
1693   if (IsVolatile) {
1694     // Set SC bits to indicate system scope.
1695     Changed |= enableSC0Bit(MI);
1696     Changed |= enableSC1Bit(MI);
1697 
1698     // Ensure operation has completed at system scope to cause all volatile
1699     // operations to be visible outside the program in a global order. Do not
1700     // request cross address space as only the global address space can be
1701     // observable outside the program, so no need to cause a waitcnt for LDS
1702     // address space operations.
1703     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1704                           Position::AFTER);
1705 
1706     return Changed;
1707   }
1708 
1709   if (IsNonTemporal) {
1710     Changed |= enableNTBit(MI);
1711     return Changed;
1712   }
1713 
1714   return Changed;
1715 }
1716 
1717 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1718                                          SIAtomicScope Scope,
1719                                          SIAtomicAddrSpace AddrSpace,
1720                                          Position Pos) const {
1721   if (!InsertCacheInv)
1722     return false;
1723 
1724   bool Changed = false;
1725 
1726   MachineBasicBlock &MBB = *MI->getParent();
1727   DebugLoc DL = MI->getDebugLoc();
1728 
1729   if (Pos == Position::AFTER)
1730     ++MI;
1731 
1732   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1733     switch (Scope) {
1734     case SIAtomicScope::SYSTEM:
1735       // Ensures that following loads will not see stale remote VMEM data or
1736       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1737       // CC will never be stale due to the local memory probes.
1738       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1739           // Set SC bits to indicate system scope.
1740           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1741       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1742       // hardware does not reorder memory operations by the same wave with
1743       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1744       // remove any cache lines of earlier writes by the same wave and ensures
1745       // later reads by the same wave will refetch the cache lines.
1746       Changed = true;
1747       break;
1748     case SIAtomicScope::AGENT:
1749       // Ensures that following loads will not see stale remote data or local
1750       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1751       // due to the memory probes.
1752       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1753           // Set SC bits to indicate agent scope.
1754           .addImm(AMDGPU::CPol::SC1);
1755       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1756       // does not reorder memory operations with respect to a preceding buffer
1757       // invalidate. The invalidate is guaranteed to remove any cache lines of
1758       // earlier writes and ensures later reads will refetch the cache lines.
1759       Changed = true;
1760       break;
1761     case SIAtomicScope::WORKGROUP:
1762       // In threadgroup split mode the waves of a work-group can be executing on
1763       // different CUs. Therefore need to invalidate the L1 which is per CU.
1764       // Otherwise in non-threadgroup split mode all waves of a work-group are
1765       // on the same CU, and so the L1 does not need to be invalidated.
1766       if (ST.isTgSplitEnabled()) {
1767         // Ensures L1 is invalidated if in threadgroup split mode. In
1768         // non-threadgroup split mode it is a NOP, but there is no point generating
1769         // it when we know we are not in that mode.
1770         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1771             // Set SC bits to indicate work-group scope.
1772             .addImm(AMDGPU::CPol::SC0);
1773         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1774         // does not reorder memory operations with respect to a preceding buffer
1775         // invalidate. The invalidate is guaranteed to remove any cache lines of
1776         // earlier writes and ensures later reads will refetch the cache lines.
1777         Changed = true;
1778       }
1779       break;
1780     case SIAtomicScope::WAVEFRONT:
1781     case SIAtomicScope::SINGLETHREAD:
1782       // Could generate "BUFFER_INV" but it would do nothing as there are no
1783       // caches to invalidate.
1784       break;
1785     default:
1786       llvm_unreachable("Unsupported synchronization scope");
1787     }
1788   }
1789 
1790   /// The scratch address space does not need the global memory cache
1791   /// to be flushed as all memory operations by the same thread are
1792   /// sequentially consistent, and no other thread can access scratch
1793   /// memory.
1794 
1795   /// Other address spaces do not have a cache.
1796 
1797   if (Pos == Position::AFTER)
1798     --MI;
1799 
1800   return Changed;
1801 }
1802 
1803 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1804                                          SIAtomicScope Scope,
1805                                          SIAtomicAddrSpace AddrSpace,
1806                                          bool IsCrossAddrSpaceOrdering,
1807                                          Position Pos) const {
1808   bool Changed = false;
1809 
1810   MachineBasicBlock &MBB = *MI->getParent();
1811   DebugLoc DL = MI->getDebugLoc();
1812 
1813   if (Pos == Position::AFTER)
1814     ++MI;
1815 
1816   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1817     switch (Scope) {
1818     case SIAtomicScope::SYSTEM:
1819       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1820       // hardware does not reorder memory operations by the same wave with
1821       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1822       // to initiate writeback of any dirty cache lines of earlier writes by the
1823       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1824       // writeback has completed.
1825       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1826           // Set SC bits to indicate system scope.
1827           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1828       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1829       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1830       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1831       Changed = true;
1832       break;
1833     case SIAtomicScope::AGENT:
1834       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1835           // Set SC bits to indicate agent scope.
1836           .addImm(AMDGPU::CPol::SC1);
1837 
1838       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1839       // SIAtomicScope::AGENT, the following insertWait will generate the
1840       // required "S_WAITCNT vmcnt(0)".
1841       Changed = true;
1842       break;
1843     case SIAtomicScope::WORKGROUP:
1844     case SIAtomicScope::WAVEFRONT:
1845     case SIAtomicScope::SINGLETHREAD:
1846       // Do not generate "BUFFER_WBL2" as there are no caches it would
1847       // writeback, and would require an otherwise unnecessary
1848       // "S_WAITCNT vmcnt(0)".
1849       break;
1850     default:
1851       llvm_unreachable("Unsupported synchronization scope");
1852     }
1853   }
1854 
1855   if (Pos == Position::AFTER)
1856     --MI;
1857 
1858   // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" above, as
1859   // well as any other waits the release requires.
1860   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1861                         IsCrossAddrSpaceOrdering, Pos);
1862 
1863   return Changed;
1864 }
1865 
1866 bool SIGfx10CacheControl::enableLoadCacheBypass(
1867     const MachineBasicBlock::iterator &MI,
1868     SIAtomicScope Scope,
1869     SIAtomicAddrSpace AddrSpace) const {
1870   assert(MI->mayLoad() && !MI->mayStore());
1871   bool Changed = false;
1872 
1873   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1874     switch (Scope) {
1875     case SIAtomicScope::SYSTEM:
1876     case SIAtomicScope::AGENT:
1877       // Set the L0 and L1 cache policies to MISS_EVICT.
1878       // Note: there is no L2 cache coherent bypass control at the ISA level.
1879       Changed |= enableGLCBit(MI);
1880       Changed |= enableDLCBit(MI);
1881       break;
1882     case SIAtomicScope::WORKGROUP:
1883       // In WGP mode the waves of a work-group can be executing on either CU of
1884       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1885       // CU mode all waves of a work-group are on the same CU, and so the L0
1886       // does not need to be bypassed.
1887       if (!ST.isCuModeEnabled())
1888         Changed |= enableGLCBit(MI);
1889       break;
1890     case SIAtomicScope::WAVEFRONT:
1891     case SIAtomicScope::SINGLETHREAD:
1892       // No cache to bypass.
1893       break;
1894     default:
1895       llvm_unreachable("Unsupported synchronization scope");
1896     }
1897   }
1898 
1899   /// The scratch address space does not need the global memory caches
1900   /// to be bypassed as all memory operations by the same thread are
1901   /// sequentially consistent, and no other thread can access scratch
1902   /// memory.
1903 
1904   /// Other address spaces do not have a cache.
1905 
1906   return Changed;
1907 }
1908 
1909 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1910     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1911     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1912 
1913   // Only handle load and store, not atomic read-modify-write instructions. The
1914   // latter use glc to indicate if the atomic returns a result and so must not
1915   // be used for cache control.
1916   assert(MI->mayLoad() ^ MI->mayStore());
1917 
1918   // Only update load and store, not LLVM IR atomic read-modify-write
1919   // instructions. The latter are always marked as volatile, so handling them
1920   // here would pessimize all atomics. They also do not support
1921   // the nontemporal attribute.
1922   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1923 
1924   bool Changed = false;
1925 
1926   if (IsVolatile) {
1927     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1928     // and MISS_LRU for store instructions.
1929     // Note: there is no L2 cache coherent bypass control at the ISA level.
1930     if (Op == SIMemOp::LOAD) {
1931       Changed |= enableGLCBit(MI);
1932       Changed |= enableDLCBit(MI);
1933     }
1934 
1935     // Ensure operation has completed at system scope to cause all volatile
1936     // operations to be visible outside the program in a global order. Do not
1937     // request cross address space as only the global address space can be
1938     // observable outside the program, so no need to cause a waitcnt for LDS
1939     // address space operations.
1940     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1941                           Position::AFTER);
1942     return Changed;
1943   }
1944 
1945   if (IsNonTemporal) {
1946     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1947     // and L2 cache policy to STREAM.
1948     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1949     // to MISS_EVICT and the L2 cache policy to STREAM.
1950     if (Op == SIMemOp::STORE)
1951       Changed |= enableGLCBit(MI);
1952     Changed |= enableSLCBit(MI);
1953 
1954     return Changed;
1955   }
1956 
1957   return Changed;
1958 }
1959 
1960 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1961                                      SIAtomicScope Scope,
1962                                      SIAtomicAddrSpace AddrSpace,
1963                                      SIMemOp Op,
1964                                      bool IsCrossAddrSpaceOrdering,
1965                                      Position Pos) const {
1966   bool Changed = false;
1967 
1968   MachineBasicBlock &MBB = *MI->getParent();
1969   DebugLoc DL = MI->getDebugLoc();
1970 
1971   if (Pos == Position::AFTER)
1972     ++MI;
1973 
1974   bool VMCnt = false;
1975   bool VSCnt = false;
1976   bool LGKMCnt = false;
1977 
1978   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1979       SIAtomicAddrSpace::NONE) {
1980     switch (Scope) {
1981     case SIAtomicScope::SYSTEM:
1982     case SIAtomicScope::AGENT:
1983       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1984         VMCnt |= true;
1985       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1986         VSCnt |= true;
1987       break;
1988     case SIAtomicScope::WORKGROUP:
1989       // In WGP mode the waves of a work-group can be executing on either CU of
1990       // the WGP. Therefore need to wait for operations to complete to ensure
1991       // they are visible to waves in the other CU as the L0 is per CU.
1992       // Otherwise in CU mode all waves of a work-group are on the same CU,
1993       // which shares the same L0.
1994       if (!ST.isCuModeEnabled()) {
1995         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1996           VMCnt |= true;
1997         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1998           VSCnt |= true;
1999       }
2000       break;
2001     case SIAtomicScope::WAVEFRONT:
2002     case SIAtomicScope::SINGLETHREAD:
2003       // The L0 cache keeps all memory operations in order for
2004       // work-items in the same wavefront.
2005       break;
2006     default:
2007       llvm_unreachable("Unsupported synchronization scope");
2008     }
2009   }
2010 
2011   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2012     switch (Scope) {
2013     case SIAtomicScope::SYSTEM:
2014     case SIAtomicScope::AGENT:
2015     case SIAtomicScope::WORKGROUP:
2016       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2017       // not needed as LDS operations for all waves are executed in a total
2018       // global ordering as observed by all waves. Required if also
2019       // synchronizing with global/GDS memory as LDS operations could be
2020       // reordered with respect to later global/GDS memory operations of the
2021       // same wave.
2022       LGKMCnt |= IsCrossAddrSpaceOrdering;
2023       break;
2024     case SIAtomicScope::WAVEFRONT:
2025     case SIAtomicScope::SINGLETHREAD:
2026       // The LDS keeps all memory operations in order for
2027       // the same wavefront.
2028       break;
2029     default:
2030       llvm_unreachable("Unsupported synchronization scope");
2031     }
2032   }
2033 
2034   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2035     switch (Scope) {
2036     case SIAtomicScope::SYSTEM:
2037     case SIAtomicScope::AGENT:
2038       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2039       // is not needed as GDS operations for all waves are executed in a total
2040       // global ordering as observed by all waves. Required if also
2041       // synchronizing with global/LDS memory as GDS operations could be
2042       // reordered with respect to later global/LDS memory operations of the
2043       // same wave.
2044       LGKMCnt |= IsCrossAddrSpaceOrdering;
2045       break;
2046     case SIAtomicScope::WORKGROUP:
2047     case SIAtomicScope::WAVEFRONT:
2048     case SIAtomicScope::SINGLETHREAD:
2049       // The GDS keeps all memory operations in order for
2050       // the same work-group.
2051       break;
2052     default:
2053       llvm_unreachable("Unsupported synchronization scope");
2054     }
2055   }
2056 
2057   if (VMCnt || LGKMCnt) {
2058     unsigned WaitCntImmediate =
2059       AMDGPU::encodeWaitcnt(IV,
2060                             VMCnt ? 0 : getVmcntBitMask(IV),
2061                             getExpcntBitMask(IV),
2062                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2063     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2064         .addImm(WaitCntImmediate);
2065     Changed = true;
2066   }
2067 
2068   if (VSCnt) {
2069     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2070         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2071         .addImm(0);
2072     Changed = true;
2073   }
2074 
2075   if (Pos == Position::AFTER)
2076     --MI;
2077 
2078   return Changed;
2079 }
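
// Illustrative only: the S_WAITCNT immediate built above packs the independent
// counters into one value, where a counter that must drain is written as 0 and
// a counter to be ignored is written as its all-ones mask (get*BitMask for the
// target's ISA version). The sketch below shows the same packing idea with
// hypothetical 4/3/4-bit fields; the real layout comes from
// AMDGPU::encodeWaitcnt and differs between ISA versions.
static unsigned encodeWaitcntSketch(bool WaitVM, bool WaitLGKM) {
  const unsigned VmcntMask = 0xF;   // hypothetical 4-bit vmcnt field
  const unsigned ExpcntMask = 0x7;  // hypothetical 3-bit expcnt field
  const unsigned LgkmcntMask = 0xF; // hypothetical 4-bit lgkmcnt field
  unsigned Vmcnt = WaitVM ? 0 : VmcntMask;     // 0 means "wait until drained"
  unsigned Expcnt = ExpcntMask;                // expcnt is never waited on here
  unsigned Lgkmcnt = WaitLGKM ? 0 : LgkmcntMask;
  // Hypothetical field order: vmcnt in bits [3:0], expcnt in [6:4],
  // lgkmcnt in [10:7].
  return Vmcnt | (Expcnt << 4) | (Lgkmcnt << 7);
}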
2080 
2081 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2082                                         SIAtomicScope Scope,
2083                                         SIAtomicAddrSpace AddrSpace,
2084                                         Position Pos) const {
2085   if (!InsertCacheInv)
2086     return false;
2087 
2088   bool Changed = false;
2089 
2090   MachineBasicBlock &MBB = *MI->getParent();
2091   DebugLoc DL = MI->getDebugLoc();
2092 
2093   if (Pos == Position::AFTER)
2094     ++MI;
2095 
2096   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2097     switch (Scope) {
2098     case SIAtomicScope::SYSTEM:
2099     case SIAtomicScope::AGENT:
2100       // The order of invalidates matters here. We must invalidate "outer in"
2101       // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2102       // invalidated.
2103       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2104       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2105       Changed = true;
2106       break;
2107     case SIAtomicScope::WORKGROUP:
2108       // In WGP mode the waves of a work-group can be executing on either CU of
2109       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2110       // in CU mode all waves of a work-group are on the same CU, and so the
2111       // L0 does not need to be invalidated.
2112       if (!ST.isCuModeEnabled()) {
2113         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2114         Changed = true;
2115       }
2116       break;
2117     case SIAtomicScope::WAVEFRONT:
2118     case SIAtomicScope::SINGLETHREAD:
2119       // No cache to invalidate.
2120       break;
2121     default:
2122       llvm_unreachable("Unsupported synchronization scope");
2123     }
2124   }
2125 
2126   /// The scratch address space does not need the global memory cache
2127   /// to be flushed as all memory operations by the same thread are
2128   /// sequentially consistent, and no other thread can access scratch
2129   /// memory.
2130 
2131   /// Other address spaces do not have a cache.
2132 
2133   if (Pos == Position::AFTER)
2134     --MI;
2135 
2136   return Changed;
2137 }
2138 
2139 bool SIGfx11CacheControl::enableLoadCacheBypass(
2140     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2141     SIAtomicAddrSpace AddrSpace) const {
2142   assert(MI->mayLoad() && !MI->mayStore());
2143   bool Changed = false;
2144 
2145   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2146     switch (Scope) {
2147     case SIAtomicScope::SYSTEM:
2148     case SIAtomicScope::AGENT:
2149       // Set the L0 and L1 cache policies to MISS_EVICT.
2150       // Note: there is no L2 cache coherent bypass control at the ISA level.
2151       Changed |= enableGLCBit(MI);
2152       break;
2153     case SIAtomicScope::WORKGROUP:
2154       // In WGP mode the waves of a work-group can be executing on either CU of
2155       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2156       // CU mode all waves of a work-group are on the same CU, and so the L0
2157       // does not need to be bypassed.
2158       if (!ST.isCuModeEnabled())
2159         Changed |= enableGLCBit(MI);
2160       break;
2161     case SIAtomicScope::WAVEFRONT:
2162     case SIAtomicScope::SINGLETHREAD:
2163       // No cache to bypass.
2164       break;
2165     default:
2166       llvm_unreachable("Unsupported synchronization scope");
2167     }
2168   }
2169 
2170   /// The scratch address space does not need the global memory caches
2171   /// to be bypassed as all memory operations by the same thread are
2172   /// sequentially consistent, and no other thread can access scratch
2173   /// memory.
2174 
2175   /// Other address spaces do not have a cache.
2176 
2177   return Changed;
2178 }
2179 
2180 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2181     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2182     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2183 
2184   // Only handle load and store, not atomic read-modify-write instructions. The
2185   // latter use glc to indicate if the atomic returns a result and so must not
2186   // be used for cache control.
2187   assert(MI->mayLoad() ^ MI->mayStore());
2188 
2189   // Only update load and store, not LLVM IR atomic read-modify-write
2190   // instructions. The latter are always marked as volatile, so handling them
2191   // here would pessimize all atomics. They also do not support
2192   // the nontemporal attribute.
2193   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2194 
2195   bool Changed = false;
2196 
2197   if (IsVolatile) {
2198     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2199     // and MISS_LRU for store instructions.
2200     // Note: there is no L2 cache coherent bypass control at the ISA level.
2201     if (Op == SIMemOp::LOAD)
2202       Changed |= enableGLCBit(MI);
2203 
2204     // Set MALL NOALLOC for load and store instructions.
2205     Changed |= enableDLCBit(MI);
2206 
2207     // Ensure operation has completed at system scope to cause all volatile
2208     // operations to be visible outside the program in a global order. Do not
2209     // request cross address space as only the global address space can be
2210     // observable outside the program, so no need to cause a waitcnt for LDS
2211     // address space operations.
2212     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2213                           Position::AFTER);
2214     return Changed;
2215   }
2216 
2217   if (IsNonTemporal) {
2218     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2219     // and L2 cache policy to STREAM.
2220     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2221     // to MISS_EVICT and the L2 cache policy to STREAM.
2222     if (Op == SIMemOp::STORE)
2223       Changed |= enableGLCBit(MI);
2224     Changed |= enableSLCBit(MI);
2225 
2226     // Set MALL NOALLOC for load and store instructions.
2227     Changed |= enableDLCBit(MI);
2228     return Changed;
2229   }
2230 
2231   return Changed;
2232 }
2233 
2234 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2235                                 AMDGPU::CPol::CPol Value) const {
2236   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2237   if (!CPol)
2238     return false;
2239 
2240   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2241   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2242     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2243     return true;
2244   }
2245 
2246   return false;
2247 }
2248 
2249 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2250                                    AMDGPU::CPol::CPol Value) const {
2251   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2252   if (!CPol)
2253     return false;
2254 
2255   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2256   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2257     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2258     return true;
2259   }
2260 
2261   return false;
2262 }
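
// Illustrative only: setTH and setScope above share one read-modify-write
// pattern on the cache-policy (cpol) immediate: mask the new value down to the
// field, compare with the current field, and rewrite the operand only when it
// actually changes. The standalone sketch below shows the pattern on a plain
// integer; updateCPolFieldSketch is a hypothetical helper, not part of this
// pass.
static bool updateCPolFieldSketch(uint64_t &Imm, uint64_t FieldMask,
                                  uint64_t Value) {
  uint64_t NewField = Value & FieldMask; // keep only the bits of this field
  if ((Imm & FieldMask) == NewField)
    return false;                        // field already holds the desired value
  Imm = (Imm & ~FieldMask) | NewField;   // clear the field, then set it
  return true;                           // report that the immediate changed
}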
2263 
2264 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2265     const MachineBasicBlock::iterator MI) const {
2266   // TODO: implement flag for frontend to give us a hint not to insert waits.
2267 
2268   MachineBasicBlock &MBB = *MI->getParent();
2269   const DebugLoc &DL = MI->getDebugLoc();
2270 
2271   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2272   BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2273   BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2274   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2275   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2276 
2277   return true;
2278 }
2279 
2280 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2281                                      SIAtomicScope Scope,
2282                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2283                                      bool IsCrossAddrSpaceOrdering,
2284                                      Position Pos) const {
2285   bool Changed = false;
2286 
2287   MachineBasicBlock &MBB = *MI->getParent();
2288   DebugLoc DL = MI->getDebugLoc();
2289 
2290   bool LOADCnt = false;
2291   bool DSCnt = false;
2292   bool STORECnt = false;
2293 
2294   if (Pos == Position::AFTER)
2295     ++MI;
2296 
2297   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2298       SIAtomicAddrSpace::NONE) {
2299     switch (Scope) {
2300     case SIAtomicScope::SYSTEM:
2301     case SIAtomicScope::AGENT:
2302       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2303         LOADCnt |= true;
2304       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2305         STORECnt |= true;
2306       break;
2307     case SIAtomicScope::WORKGROUP:
2308       // In WGP mode the waves of a work-group can be executing on either CU of
2309       // the WGP. Therefore need to wait for operations to complete to ensure
2310       // they are visible to waves in the other CU as the L0 is per CU.
2311       // Otherwise in CU mode all waves of a work-group are on the same CU,
2312       // which shares the same L0.
2313       if (!ST.isCuModeEnabled()) {
2314         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2315           LOADCnt |= true;
2316         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2317           STORECnt |= true;
2318       }
2319       break;
2320     case SIAtomicScope::WAVEFRONT:
2321     case SIAtomicScope::SINGLETHREAD:
2322       // The L0 cache keeps all memory operations in order for
2323       // work-items in the same wavefront.
2324       break;
2325     default:
2326       llvm_unreachable("Unsupported synchronization scope");
2327     }
2328   }
2329 
2330   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2331     switch (Scope) {
2332     case SIAtomicScope::SYSTEM:
2333     case SIAtomicScope::AGENT:
2334     case SIAtomicScope::WORKGROUP:
2335       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2336       // not needed as LDS operations for all waves are executed in a total
2337       // global ordering as observed by all waves. Required if also
2338       // synchronizing with global/GDS memory as LDS operations could be
2339       // reordered with respect to later global/GDS memory operations of the
2340       // same wave.
2341       DSCnt |= IsCrossAddrSpaceOrdering;
2342       break;
2343     case SIAtomicScope::WAVEFRONT:
2344     case SIAtomicScope::SINGLETHREAD:
2345       // The LDS keeps all memory operations in order for
2346       // the same wavefront.
2347       break;
2348     default:
2349       llvm_unreachable("Unsupported synchronization scope");
2350     }
2351   }
2352 
2353   if (LOADCnt) {
2354     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2355     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2356     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2357     Changed = true;
2358   }
2359 
2360   if (STORECnt) {
2361     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2362     Changed = true;
2363   }
2364 
2365   if (DSCnt) {
2366     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2367     Changed = true;
2368   }
2369 
2370   if (Pos == Position::AFTER)
2371     --MI;
2372 
2373   return Changed;
2374 }
2375 
2376 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2377                                         SIAtomicScope Scope,
2378                                         SIAtomicAddrSpace AddrSpace,
2379                                         Position Pos) const {
2380   if (!InsertCacheInv)
2381     return false;
2382 
2383   MachineBasicBlock &MBB = *MI->getParent();
2384   DebugLoc DL = MI->getDebugLoc();
2385 
2386   /// The scratch address space does not need the global memory cache
2387   /// to be flushed as all memory operations by the same thread are
2388   /// sequentially consistent, and no other thread can access scratch
2389   /// memory.
2390 
2391   /// Other address spaces do not have a cache.
2392   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2393     return false;
2394 
2395   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2396   switch (Scope) {
2397   case SIAtomicScope::SYSTEM:
2398     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2399     break;
2400   case SIAtomicScope::AGENT:
2401     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2402     break;
2403   case SIAtomicScope::WORKGROUP:
2404     // In WGP mode the waves of a work-group can be executing on either CU of
2405     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2406     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2407     // the L0 does not need to be invalidated.
2408     if (ST.isCuModeEnabled())
2409       return false;
2410 
2411     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2412     break;
2413   case SIAtomicScope::WAVEFRONT:
2414   case SIAtomicScope::SINGLETHREAD:
2415     // No cache to invalidate.
2416     return false;
2417   default:
2418     llvm_unreachable("Unsupported synchronization scope");
2419   }
2420 
2421   if (Pos == Position::AFTER)
2422     ++MI;
2423 
2424   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2425 
2426   if (Pos == Position::AFTER)
2427     --MI;
2428 
2429   return true;
2430 }
2431 
2432 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2433     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2434     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2435 
2436   // Only handle load and store, not atomic read-modify-write instructions.
2437   assert(MI->mayLoad() ^ MI->mayStore());
2438 
2439   // Only update load and store, not LLVM IR atomic read-modify-write
2440   // instructions. The latter are always marked as volatile, so handling them
2441   // here would pessimize all atomics. They also do not support
2442   // the nontemporal attribute.
2443   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2444 
2445   bool Changed = false;
2446 
2447   if (IsLastUse) {
2448     // Set last-use hint.
2449     Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2450   } else if (IsNonTemporal) {
2451     // Set non-temporal hint for all cache levels.
2452     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2453   }
2454 
2455   if (IsVolatile) {
2456     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2457 
2458     if (Op == SIMemOp::STORE)
2459       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2460 
2461     // Ensure operation has completed at system scope to cause all volatile
2462     // operations to be visible outside the program in a global order. Do not
2463     // request cross address space as only the global address space can be
2464     // observable outside the program, so no need to cause a waitcnt for LDS
2465     // address space operations.
2466     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2467                           Position::AFTER);
2468   }
2469 
2470   return Changed;
2471 }
2472 
2473 bool SIGfx12CacheControl::expandSystemScopeStore(
2474     MachineBasicBlock::iterator &MI) const {
2475   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2476   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2477     return insertWaitsBeforeSystemScopeStore(MI);
2478 
2479   return false;
2480 }
2481 
2482 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2483   if (AtomicPseudoMIs.empty())
2484     return false;
2485 
2486   for (auto &MI : AtomicPseudoMIs)
2487     MI->eraseFromParent();
2488 
2489   AtomicPseudoMIs.clear();
2490   return true;
2491 }
2492 
2493 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2494                                    MachineBasicBlock::iterator &MI) {
2495   assert(MI->mayLoad() && !MI->mayStore());
2496 
2497   bool Changed = false;
2498 
2499   if (MOI.isAtomic()) {
2500     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2501         MOI.getOrdering() == AtomicOrdering::Acquire ||
2502         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2503       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2504                                            MOI.getOrderingAddrSpace());
2505     }
2506 
2507     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2508       Changed |= CC->insertWait(MI, MOI.getScope(),
2509                                 MOI.getOrderingAddrSpace(),
2510                                 SIMemOp::LOAD | SIMemOp::STORE,
2511                                 MOI.getIsCrossAddressSpaceOrdering(),
2512                                 Position::BEFORE);
2513 
2514     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2515         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2516       Changed |= CC->insertWait(MI, MOI.getScope(),
2517                                 MOI.getInstrAddrSpace(),
2518                                 SIMemOp::LOAD,
2519                                 MOI.getIsCrossAddressSpaceOrdering(),
2520                                 Position::AFTER);
2521       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2522                                    MOI.getOrderingAddrSpace(),
2523                                    Position::AFTER);
2524     }
2525 
2526     return Changed;
2527   }
2528 
2529   // Atomic instructions already bypass caches to the scope specified by the
2530   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2531   // instructions need additional treatment.
2532   Changed |= CC->enableVolatileAndOrNonTemporal(
2533       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2534       MOI.isNonTemporal(), MOI.isLastUse());
2535 
2536   return Changed;
2537 }
2538 
2539 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2540                                     MachineBasicBlock::iterator &MI) {
2541   assert(!MI->mayLoad() && MI->mayStore());
2542 
2543   bool Changed = false;
2544 
2545   if (MOI.isAtomic()) {
2546     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2547         MOI.getOrdering() == AtomicOrdering::Release ||
2548         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2549       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2550                                             MOI.getOrderingAddrSpace());
2551     }
2552 
2553     if (MOI.getOrdering() == AtomicOrdering::Release ||
2554         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2555       Changed |= CC->insertRelease(MI, MOI.getScope(),
2556                                    MOI.getOrderingAddrSpace(),
2557                                    MOI.getIsCrossAddressSpaceOrdering(),
2558                                    Position::BEFORE);
2559 
2560     return Changed;
2561   }
2562 
2563   // Atomic instructions already bypass caches to the scope specified by the
2564   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2565   // need additional treatment.
2566   Changed |= CC->enableVolatileAndOrNonTemporal(
2567       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2568       MOI.isNonTemporal());
2569 
2570   // GFX12 specific: scope (the desired coherence domain in the cache hierarchy)
2571   // is an instruction field; do not confuse it with the atomic scope.
2572   Changed |= CC->expandSystemScopeStore(MI);
2573   return Changed;
2574 }
2575 
2576 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2577                                           MachineBasicBlock::iterator &MI) {
2578   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2579 
2580   AtomicPseudoMIs.push_back(MI);
2581   bool Changed = false;
2582 
2583   // Refine fenced address space based on MMRAs.
2584   //
2585   // TODO: Should we support this MMRA on other atomic operations?
2586   auto OrderingAddrSpace =
2587       getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2588 
2589   if (MOI.isAtomic()) {
2590     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2591       Changed |= CC->insertWait(
2592           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2593           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);
2594 
2595     if (MOI.getOrdering() == AtomicOrdering::Release ||
2596         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2597         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2598       /// TODO: This relies on a barrier always generating a waitcnt
2599       /// for LDS to ensure it is not reordered with the completion of
2600       /// the preceding LDS operations. If the barrier had a memory
2601       /// ordering and memory scope, then the library would not need to
2602       /// generate a fence. Could add support in this file for
2603       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2604       /// adding S_WAITCNT before a S_BARRIER.
2605       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2606                                    MOI.getIsCrossAddressSpaceOrdering(),
2607                                    Position::BEFORE);
2608 
2609     // TODO: If both release and invalidate are happening they could be combined
2610     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2611     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2612     // track cache invalidate and write back instructions.
2613 
2614     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2615         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2616         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2617       Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2618                                    Position::BEFORE);
2619 
2620     return Changed;
2621   }
2622 
2623   return Changed;
2624 }
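
// Illustrative only: for an atomic fence the expansion above reduces to three
// independent decisions keyed on the fence ordering. The sketch below restates
// that mapping; FenceActionsSketch and fenceActionsSketch are hypothetical
// names used only to summarize the logic of expandAtomicFence.
struct FenceActionsSketch {
  bool WaitBefore;    // insertWait before the fence (acquire only)
  bool ReleaseBefore; // insertRelease before the fence
  bool AcquireBefore; // insertAcquire (cache invalidate) before the fence
};
static FenceActionsSketch fenceActionsSketch(AtomicOrdering Ordering) {
  FenceActionsSketch A{false, false, false};
  A.WaitBefore = Ordering == AtomicOrdering::Acquire;
  A.ReleaseBefore = Ordering == AtomicOrdering::Release ||
                    Ordering == AtomicOrdering::AcquireRelease ||
                    Ordering == AtomicOrdering::SequentiallyConsistent;
  A.AcquireBefore = Ordering == AtomicOrdering::Acquire ||
                    Ordering == AtomicOrdering::AcquireRelease ||
                    Ordering == AtomicOrdering::SequentiallyConsistent;
  return A;
}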
2625 
2626 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2627   MachineBasicBlock::iterator &MI) {
2628   assert(MI->mayLoad() && MI->mayStore());
2629 
2630   bool Changed = false;
2631 
2632   if (MOI.isAtomic()) {
2633     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2634         MOI.getOrdering() == AtomicOrdering::Acquire ||
2635         MOI.getOrdering() == AtomicOrdering::Release ||
2636         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2637         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2638       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2639                                           MOI.getInstrAddrSpace());
2640     }
2641 
2642     if (MOI.getOrdering() == AtomicOrdering::Release ||
2643         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2644         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2645         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2646       Changed |= CC->insertRelease(MI, MOI.getScope(),
2647                                    MOI.getOrderingAddrSpace(),
2648                                    MOI.getIsCrossAddressSpaceOrdering(),
2649                                    Position::BEFORE);
2650 
2651     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2652         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2653         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2654         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2655         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2656       Changed |= CC->insertWait(MI, MOI.getScope(),
2657                                 MOI.getInstrAddrSpace(),
2658                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2659                                                    SIMemOp::STORE,
2660                                 MOI.getIsCrossAddressSpaceOrdering(),
2661                                 Position::AFTER);
2662       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2663                                    MOI.getOrderingAddrSpace(),
2664                                    Position::AFTER);
2665     }
2666 
2667     return Changed;
2668   }
2669 
2670   return Changed;
2671 }
2672 
2673 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2674   bool Changed = false;
2675 
2676   SIMemOpAccess MOA(MF);
2677   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2678 
2679   for (auto &MBB : MF) {
2680     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2681 
2682       // Unbundle instructions after the post-RA scheduler.
2683       if (MI->isBundle() && MI->mayLoadOrStore()) {
2684         MachineBasicBlock::instr_iterator II(MI->getIterator());
2685         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2686              I != E && I->isBundledWithPred(); ++I) {
2687           I->unbundleFromPred();
2688           for (MachineOperand &MO : I->operands())
2689             if (MO.isReg())
2690               MO.setIsInternalRead(false);
2691         }
2692 
2693         MI->eraseFromParent();
2694         MI = II->getIterator();
2695       }
2696 
2697       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2698         continue;
2699 
2700       if (const auto &MOI = MOA.getLoadInfo(MI))
2701         Changed |= expandLoad(*MOI, MI);
2702       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2703         Changed |= expandStore(*MOI, MI);
2704         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2705       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2706         Changed |= expandAtomicFence(*MOI, MI);
2707       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2708         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2709     }
2710   }
2711 
2712   Changed |= removeAtomicPseudoMIs();
2713   return Changed;
2714 }
2715 
2716 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2717 
2718 char SIMemoryLegalizer::ID = 0;
2719 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2720 
2721 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2722   return new SIMemoryLegalizer();
2723 }
2724