xref: /llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision ef067f52044042fbe1b6fa21a90bfdbcf1622b02)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/TargetParser/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35     cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43   NONE = 0u,
44   LOAD = 1u << 0,
45   STORE = 1u << 1,
46   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48 
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52   BEFORE,
53   AFTER
54 };
55 
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58   NONE,
59   SINGLETHREAD,
60   WAVEFRONT,
61   WORKGROUP,
62   AGENT,
63   SYSTEM
64 };
65 
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operations. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69   NONE = 0u,
70   GLOBAL = 1u << 0,
71   LDS = 1u << 1,
72   SCRATCH = 1u << 2,
73   GDS = 1u << 3,
74   OTHER = 1u << 4,
75 
76   /// The address spaces that can be accessed by a FLAT instruction.
77   FLAT = GLOBAL | LDS | SCRATCH,
78 
79   /// The address spaces that support atomic instructions.
80   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82   /// All address spaces.
83   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
87 
88 class SIMemOpInfo final {
89 private:
90 
91   friend class SIMemOpAccess;
92 
93   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98   bool IsCrossAddressSpaceOrdering = false;
99   bool IsVolatile = false;
100   bool IsNonTemporal = false;
101 
102   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106               bool IsCrossAddressSpaceOrdering = true,
107               AtomicOrdering FailureOrdering =
108                 AtomicOrdering::SequentiallyConsistent,
109               bool IsVolatile = false,
110               bool IsNonTemporal = false)
111     : Ordering(Ordering), FailureOrdering(FailureOrdering),
112       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113       InstrAddrSpace(InstrAddrSpace),
114       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115       IsVolatile(IsVolatile),
116       IsNonTemporal(IsNonTemporal) {
117 
118     if (Ordering == AtomicOrdering::NotAtomic) {
119       assert(Scope == SIAtomicScope::NONE &&
120              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121              !IsCrossAddressSpaceOrdering &&
122              FailureOrdering == AtomicOrdering::NotAtomic);
123       return;
124     }
125 
126     assert(Scope != SIAtomicScope::NONE &&
127            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128                SIAtomicAddrSpace::NONE &&
129            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE);
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
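        // For example, an access that only touches scratch is private to a single
        // thread, and one limited to scratch and LDS cannot be observed beyond its
        // work-group.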
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
174   /// \returns The address spaces accessed by the machine
175   /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
186   /// \returns True iff memory ordering of operations on
187   /// different address spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if ordering constraint of the machine instruction used to
205   /// create this SIMemOpInfo is unordered or higher, false otherwise.
206   bool isAtomic() const {
207     return Ordering != AtomicOrdering::NotAtomic;
208   }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214   AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216   /// Reports the unsupported message \p Msg for \p MI to the LLVM context.
217   void reportUnsupported(const MachineBasicBlock::iterator &MI,
218                          const char *Msg) const;
219 
220   /// Inspects the target synchronization scope \p SSID and determines
221   /// the SI atomic scope it corresponds to, the address spaces it
222   /// covers, and whether the memory ordering applies between address
223   /// spaces.
224   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227   /// \returns A bit set of the SI atomic address spaces corresponding to \p AS.
228   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230   /// \returns Info constructed from \p MI, which has at least one machine
231   /// memory operand.
232   std::optional<SIMemOpInfo>
233   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236   /// Construct class to support accessing the machine memory operands
237   /// of instructions in the machine function \p MF.
238   SIMemOpAccess(MachineFunction &MF);
239 
240   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241   std::optional<SIMemOpInfo>
242   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243 
244   /// \returns Store info if \p MI is a store operation, "std::nullopt"
245   /// otherwise.
246   std::optional<SIMemOpInfo>
247   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248 
249   /// \returns Atomic fence info if \p MI is an atomic fence operation,
250   /// "std::nullopt" otherwise.
251   std::optional<SIMemOpInfo>
252   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253 
254   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255   /// rmw operation, "std::nullopt" otherwise.
256   std::optional<SIMemOpInfo>
257   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259 
260 class SICacheControl {
261 protected:
262 
263   /// AMDGPU subtarget info.
264   const GCNSubtarget &ST;
265 
266   /// Instruction info.
267   const SIInstrInfo *TII = nullptr;
268 
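      /// ISA version of the subtarget; used to encode S_WAITCNT operand fields.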
269   IsaVersion IV;
270 
271   /// Whether to insert cache invalidating instructions.
272   bool InsertCacheInv;
273 
274   SICacheControl(const GCNSubtarget &ST);
275 
276   /// Sets the cache policy bit \p Bit to "true" if present in instruction \p MI.
277   /// \returns True if \p MI is modified, false otherwise.
278   bool enableNamedBit(const MachineBasicBlock::iterator MI,
279                       AMDGPU::CPol::CPol Bit) const;
280 
281 public:
282 
283   /// Create a cache control for the subtarget \p ST.
284   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285 
286   /// Update \p MI memory load instruction to bypass any caches up to
287   /// the \p Scope memory scope for address spaces \p
288   /// AddrSpace. Return true iff the instruction was modified.
289   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290                                      SIAtomicScope Scope,
291                                      SIAtomicAddrSpace AddrSpace) const = 0;
292 
293   /// Update \p MI memory store instruction to bypass any caches up to
294   /// the \p Scope memory scope for address spaces \p
295   /// AddrSpace. Return true iff the instruction was modified.
296   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297                                       SIAtomicScope Scope,
298                                       SIAtomicAddrSpace AddrSpace) const = 0;
299 
300   /// Update \p MI memory read-modify-write instruction to bypass any caches up
301   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302   /// iff the instruction was modified.
303   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304                                     SIAtomicScope Scope,
305                                     SIAtomicAddrSpace AddrSpace) const = 0;
306 
307   /// Update \p MI memory instruction of kind \p Op associated with address
308   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309   /// true iff the instruction was modified.
310   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311                                               SIAtomicAddrSpace AddrSpace,
312                                               SIMemOp Op, bool IsVolatile,
313                                               bool IsNonTemporal) const = 0;
314 
315   /// Inserts any necessary instructions at position \p Pos relative
316   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317   /// \p Op associated with address spaces \p AddrSpace have completed. Used
318   /// between memory instructions to enforce the order they become visible as
319   /// observed by other memory instructions executing in memory scope \p Scope.
320   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321   /// address spaces. Returns true iff any instructions are inserted.
322   virtual bool insertWait(MachineBasicBlock::iterator &MI,
323                           SIAtomicScope Scope,
324                           SIAtomicAddrSpace AddrSpace,
325                           SIMemOp Op,
326                           bool IsCrossAddrSpaceOrdering,
327                           Position Pos) const = 0;
328 
329   /// Inserts any necessary instructions at position \p Pos relative to
330   /// instruction \p MI to ensure any subsequent memory instructions of this
331   /// thread with address spaces \p AddrSpace will observe the previous memory
332   /// operations by any thread for memory scopes up to memory scope \p Scope.
333   /// Returns true iff any instructions are inserted.
334   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335                              SIAtomicScope Scope,
336                              SIAtomicAddrSpace AddrSpace,
337                              Position Pos) const = 0;
338 
339   /// Inserts any necessary instructions at position \p Pos relative to
340   /// instruction \p MI to ensure previous memory instructions by this thread
341   /// with address spaces \p AddrSpace have completed and can be observed by
342   /// subsequent memory instructions by any thread executing in memory scope \p
343   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344   /// between address spaces. Returns true iff any instructions are inserted.
345   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              bool IsCrossAddrSpaceOrdering,
349                              Position Pos) const = 0;
350 
351   /// Virtual destructor to allow derivations to be deleted.
352   virtual ~SICacheControl() = default;
353 
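      /// Optionally force the SC0/SC1 cache-policy bits on a store when a subtarget
      /// workaround requires it. The base implementation makes no change and
      /// returns false.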
354   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
355                                    MachineBasicBlock::iterator &MI) const {
356     return false;
357   }
358 };
359 
360 class SIGfx6CacheControl : public SICacheControl {
361 protected:
362 
363   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
364   /// is modified, false otherwise.
365   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
366     return enableNamedBit(MI, AMDGPU::CPol::GLC);
367   }
368 
369   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
370   /// is modified, false otherwise.
371   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
372     return enableNamedBit(MI, AMDGPU::CPol::SLC);
373   }
374 
375 public:
376 
377   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
378 
379   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380                              SIAtomicScope Scope,
381                              SIAtomicAddrSpace AddrSpace) const override;
382 
383   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
384                               SIAtomicScope Scope,
385                               SIAtomicAddrSpace AddrSpace) const override;
386 
387   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
388                             SIAtomicScope Scope,
389                             SIAtomicAddrSpace AddrSpace) const override;
390 
391   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
392                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
393                                       bool IsVolatile,
394                                       bool IsNonTemporal) const override;
395 
396   bool insertWait(MachineBasicBlock::iterator &MI,
397                   SIAtomicScope Scope,
398                   SIAtomicAddrSpace AddrSpace,
399                   SIMemOp Op,
400                   bool IsCrossAddrSpaceOrdering,
401                   Position Pos) const override;
402 
403   bool insertAcquire(MachineBasicBlock::iterator &MI,
404                      SIAtomicScope Scope,
405                      SIAtomicAddrSpace AddrSpace,
406                      Position Pos) const override;
407 
408   bool insertRelease(MachineBasicBlock::iterator &MI,
409                      SIAtomicScope Scope,
410                      SIAtomicAddrSpace AddrSpace,
411                      bool IsCrossAddrSpaceOrdering,
412                      Position Pos) const override;
413 };
414 
415 class SIGfx7CacheControl : public SIGfx6CacheControl {
416 public:
417 
418   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
419 
420   bool insertAcquire(MachineBasicBlock::iterator &MI,
421                      SIAtomicScope Scope,
422                      SIAtomicAddrSpace AddrSpace,
423                      Position Pos) const override;
424 
425 };
426 
427 class SIGfx90ACacheControl : public SIGfx7CacheControl {
428 public:
429 
430   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
431 
432   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
433                              SIAtomicScope Scope,
434                              SIAtomicAddrSpace AddrSpace) const override;
435 
436   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
437                               SIAtomicScope Scope,
438                               SIAtomicAddrSpace AddrSpace) const override;
439 
440   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
441                             SIAtomicScope Scope,
442                             SIAtomicAddrSpace AddrSpace) const override;
443 
444   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
445                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
446                                       bool IsVolatile,
447                                       bool IsNonTemporal) const override;
448 
449   bool insertWait(MachineBasicBlock::iterator &MI,
450                   SIAtomicScope Scope,
451                   SIAtomicAddrSpace AddrSpace,
452                   SIMemOp Op,
453                   bool IsCrossAddrSpaceOrdering,
454                   Position Pos) const override;
455 
456   bool insertAcquire(MachineBasicBlock::iterator &MI,
457                      SIAtomicScope Scope,
458                      SIAtomicAddrSpace AddrSpace,
459                      Position Pos) const override;
460 
461   bool insertRelease(MachineBasicBlock::iterator &MI,
462                      SIAtomicScope Scope,
463                      SIAtomicAddrSpace AddrSpace,
464                      bool IsCrossAddrSpaceOrdering,
465                      Position Pos) const override;
466 };
467 
468 class SIGfx940CacheControl : public SIGfx90ACacheControl {
469 protected:
470 
471   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
472   /// is modified, false otherwise.
473   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
474     return enableNamedBit(MI, AMDGPU::CPol::SC0);
475   }
476 
477   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
478   /// is modified, false otherwise.
479   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
480     return enableNamedBit(MI, AMDGPU::CPol::SC1);
481   }
482 
483   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
484   /// is modified, false otherwise.
485   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
486     return enableNamedBit(MI, AMDGPU::CPol::NT);
487   }
488 
489 public:
490 
491   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
492 
493   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
494                              SIAtomicScope Scope,
495                              SIAtomicAddrSpace AddrSpace) const override;
496 
497   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
498                               SIAtomicScope Scope,
499                               SIAtomicAddrSpace AddrSpace) const override;
500 
501   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
502                             SIAtomicScope Scope,
503                             SIAtomicAddrSpace AddrSpace) const override;
504 
505   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
506                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
507                                       bool IsVolatile,
508                                       bool IsNonTemporal) const override;
509 
510   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
511                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
512 
513   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
514                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
515                      Position Pos) const override;
516 
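      /// On subtargets with the force-store-SC0-SC1 workaround, set SC0 and SC1 on
      /// stores that may access the scratch, global, or other address spaces.
      /// Returns true iff \p MI was modified.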
517   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
518                            MachineBasicBlock::iterator &MI) const override {
519     bool Changed = false;
520     if (ST.hasForceStoreSC0SC1() &&
521         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
522                                     SIAtomicAddrSpace::GLOBAL |
523                                     SIAtomicAddrSpace::OTHER)) !=
524          SIAtomicAddrSpace::NONE) {
525       Changed |= enableSC0Bit(MI);
526       Changed |= enableSC1Bit(MI);
527     }
528     return Changed;
529   }
530 };
531 
532 class SIGfx10CacheControl : public SIGfx7CacheControl {
533 protected:
534 
535   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
536   /// is modified, false otherwise.
537   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
538     return enableNamedBit(MI, AMDGPU::CPol::DLC);
539   }
540 
541 public:
542 
543   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
544 
545   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546                              SIAtomicScope Scope,
547                              SIAtomicAddrSpace AddrSpace) const override;
548 
549   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
550                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
551                                       bool IsVolatile,
552                                       bool IsNonTemporal) const override;
553 
554   bool insertWait(MachineBasicBlock::iterator &MI,
555                   SIAtomicScope Scope,
556                   SIAtomicAddrSpace AddrSpace,
557                   SIMemOp Op,
558                   bool IsCrossAddrSpaceOrdering,
559                   Position Pos) const override;
560 
561   bool insertAcquire(MachineBasicBlock::iterator &MI,
562                      SIAtomicScope Scope,
563                      SIAtomicAddrSpace AddrSpace,
564                      Position Pos) const override;
565 };
566 
567 class SIGfx11CacheControl : public SIGfx10CacheControl {
568 public:
569   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
570 
571   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
572                              SIAtomicScope Scope,
573                              SIAtomicAddrSpace AddrSpace) const override;
574 
575   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
576                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577                                       bool IsVolatile,
578                                       bool IsNonTemporal) const override;
579 };
580 
581 class SIMemoryLegalizer final : public MachineFunctionPass {
582 private:
583 
584   /// Cache Control.
585   std::unique_ptr<SICacheControl> CC = nullptr;
586 
587   /// List of atomic pseudo instructions.
588   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
589 
590   /// Return true iff instruction \p MI is an atomic instruction that
591   /// returns a result.
592   bool isAtomicRet(const MachineInstr &MI) const {
593     return SIInstrInfo::isAtomicRet(MI);
594   }
595 
596   /// Removes all processed atomic pseudo instructions from the current
597   /// function. Returns true if current function is modified, false otherwise.
598   bool removeAtomicPseudoMIs();
599 
600   /// Expands load operation \p MI. Returns true if instructions are
601   /// added/deleted or \p MI is modified, false otherwise.
602   bool expandLoad(const SIMemOpInfo &MOI,
603                   MachineBasicBlock::iterator &MI);
604   /// Expands store operation \p MI. Returns true if instructions are
605   /// added/deleted or \p MI is modified, false otherwise.
606   bool expandStore(const SIMemOpInfo &MOI,
607                    MachineBasicBlock::iterator &MI);
608   /// Expands atomic fence operation \p MI. Returns true if
609   /// instructions are added/deleted or \p MI is modified, false otherwise.
610   bool expandAtomicFence(const SIMemOpInfo &MOI,
611                          MachineBasicBlock::iterator &MI);
612   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
613   /// instructions are added/deleted or \p MI is modified, false otherwise.
614   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
615                                 MachineBasicBlock::iterator &MI);
616 
617 public:
618   static char ID;
619 
620   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
621 
622   void getAnalysisUsage(AnalysisUsage &AU) const override {
623     AU.setPreservesCFG();
624     MachineFunctionPass::getAnalysisUsage(AU);
625   }
626 
627   StringRef getPassName() const override {
628     return PASS_NAME;
629   }
630 
631   bool runOnMachineFunction(MachineFunction &MF) override;
632 };
633 
634 } // end anonymous namespace
635 
636 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
637                                       const char *Msg) const {
638   const Function &Func = MI->getParent()->getParent()->getFunction();
639   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
640   Func.getContext().diagnose(Diag);
641 }
642 
643 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
644 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
645                                SIAtomicAddrSpace InstrAddrSpace) const {
646   if (SSID == SyncScope::System)
647     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
648   if (SSID == MMI->getAgentSSID())
649     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
650   if (SSID == MMI->getWorkgroupSSID())
651     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
652                       true);
653   if (SSID == MMI->getWavefrontSSID())
654     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
655                       true);
656   if (SSID == SyncScope::SingleThread)
657     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
658                       true);
659   if (SSID == MMI->getSystemOneAddressSpaceSSID())
660     return std::tuple(SIAtomicScope::SYSTEM,
661                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
662   if (SSID == MMI->getAgentOneAddressSpaceSSID())
663     return std::tuple(SIAtomicScope::AGENT,
664                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
665   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
666     return std::tuple(SIAtomicScope::WORKGROUP,
667                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
668   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
669     return std::tuple(SIAtomicScope::WAVEFRONT,
670                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
671   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
672     return std::tuple(SIAtomicScope::SINGLETHREAD,
673                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
674   return std::nullopt;
675 }
676 
677 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
678   if (AS == AMDGPUAS::FLAT_ADDRESS)
679     return SIAtomicAddrSpace::FLAT;
680   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
681     return SIAtomicAddrSpace::GLOBAL;
682   if (AS == AMDGPUAS::LOCAL_ADDRESS)
683     return SIAtomicAddrSpace::LDS;
684   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
685     return SIAtomicAddrSpace::SCRATCH;
686   if (AS == AMDGPUAS::REGION_ADDRESS)
687     return SIAtomicAddrSpace::GDS;
688 
689   return SIAtomicAddrSpace::OTHER;
690 }
691 
692 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
693   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
694 }
695 
696 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
697     const MachineBasicBlock::iterator &MI) const {
698   assert(MI->getNumMemOperands() > 0);
699 
700   SyncScope::ID SSID = SyncScope::SingleThread;
701   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
702   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
703   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
704   bool IsNonTemporal = true;
705   bool IsVolatile = false;
706 
707   // Validator should check whether or not MMOs cover the entire set of
708   // locations accessed by the memory instruction.
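      // Merge the properties of all memory operands: volatility and address spaces
      // accumulate, nontemporal must hold for every operand, and the synchronization
      // scope and atomic orderings are merged to the most inclusive values seen.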
709   for (const auto &MMO : MI->memoperands()) {
710     IsNonTemporal &= MMO->isNonTemporal();
711     IsVolatile |= MMO->isVolatile();
712     InstrAddrSpace |=
713       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
714     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
715     if (OpOrdering != AtomicOrdering::NotAtomic) {
716       const auto &IsSyncScopeInclusion =
717           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
718       if (!IsSyncScopeInclusion) {
719         reportUnsupported(MI,
720           "Unsupported non-inclusive atomic synchronization scope");
721         return std::nullopt;
722       }
723 
724       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
725       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
726       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
727              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
728       FailureOrdering =
729           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
730     }
731   }
732 
733   SIAtomicScope Scope = SIAtomicScope::NONE;
734   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
735   bool IsCrossAddressSpaceOrdering = false;
736   if (Ordering != AtomicOrdering::NotAtomic) {
737     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
738     if (!ScopeOrNone) {
739       reportUnsupported(MI, "Unsupported atomic synchronization scope");
740       return std::nullopt;
741     }
742     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
743         *ScopeOrNone;
744     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
745         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
746         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
747       reportUnsupported(MI, "Unsupported atomic address space");
748       return std::nullopt;
749     }
750   }
751   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
752                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
753                      IsNonTemporal);
754 }
755 
756 std::optional<SIMemOpInfo>
757 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
758   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
759 
760   if (!(MI->mayLoad() && !MI->mayStore()))
761     return std::nullopt;
762 
763   // Be conservative if there are no memory operands.
764   if (MI->getNumMemOperands() == 0)
765     return SIMemOpInfo();
766 
767   return constructFromMIWithMMO(MI);
768 }
769 
770 std::optional<SIMemOpInfo>
771 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
772   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
773 
774   if (!(!MI->mayLoad() && MI->mayStore()))
775     return std::nullopt;
776 
777   // Be conservative if there are no memory operands.
778   if (MI->getNumMemOperands() == 0)
779     return SIMemOpInfo();
780 
781   return constructFromMIWithMMO(MI);
782 }
783 
784 std::optional<SIMemOpInfo>
785 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
786   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
787 
788   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
789     return std::nullopt;
790 
791   AtomicOrdering Ordering =
792     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
793 
794   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
795   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
796   if (!ScopeOrNone) {
797     reportUnsupported(MI, "Unsupported atomic synchronization scope");
798     return std::nullopt;
799   }
800 
801   SIAtomicScope Scope = SIAtomicScope::NONE;
802   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
803   bool IsCrossAddressSpaceOrdering = false;
804   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
805       *ScopeOrNone;
806 
807   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
808       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
809     reportUnsupported(MI, "Unsupported atomic address space");
810     return std::nullopt;
811   }
812 
813   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
814                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
815 }
816 
817 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
818     const MachineBasicBlock::iterator &MI) const {
819   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
820 
821   if (!(MI->mayLoad() && MI->mayStore()))
822     return std::nullopt;
823 
824   // Be conservative if there are no memory operands.
825   if (MI->getNumMemOperands() == 0)
826     return SIMemOpInfo();
827 
828   return constructFromMIWithMMO(MI);
829 }
830 
831 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
832   TII = ST.getInstrInfo();
833   IV = getIsaVersion(ST.getCPU());
834   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
835 }
836 
837 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
838                                     AMDGPU::CPol::CPol Bit) const {
839   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
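      // Instructions without a cache-policy (cpol) operand cannot carry the bit and
      // are left unchanged.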
840   if (!CPol)
841     return false;
842 
843   CPol->setImm(CPol->getImm() | Bit);
844   return true;
845 }
846 
847 /* static */
848 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
849   GCNSubtarget::Generation Generation = ST.getGeneration();
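      // Check the more specific gfx940/gfx90a features first, then fall back to the
      // generation-based defaults.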
850   if (ST.hasGFX940Insts())
851     return std::make_unique<SIGfx940CacheControl>(ST);
852   if (ST.hasGFX90AInsts())
853     return std::make_unique<SIGfx90ACacheControl>(ST);
854   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
855     return std::make_unique<SIGfx6CacheControl>(ST);
856   if (Generation < AMDGPUSubtarget::GFX10)
857     return std::make_unique<SIGfx7CacheControl>(ST);
858   if (Generation < AMDGPUSubtarget::GFX11)
859     return std::make_unique<SIGfx10CacheControl>(ST);
860   return std::make_unique<SIGfx11CacheControl>(ST);
861 }
862 
863 bool SIGfx6CacheControl::enableLoadCacheBypass(
864     const MachineBasicBlock::iterator &MI,
865     SIAtomicScope Scope,
866     SIAtomicAddrSpace AddrSpace) const {
867   assert(MI->mayLoad() && !MI->mayStore());
868   bool Changed = false;
869 
870   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
871     switch (Scope) {
872     case SIAtomicScope::SYSTEM:
873     case SIAtomicScope::AGENT:
874       // Set L1 cache policy to MISS_EVICT.
875       // Note: there is no L2 cache bypass policy at the ISA level.
876       Changed |= enableGLCBit(MI);
877       break;
878     case SIAtomicScope::WORKGROUP:
879     case SIAtomicScope::WAVEFRONT:
880     case SIAtomicScope::SINGLETHREAD:
881       // No cache to bypass.
882       break;
883     default:
884       llvm_unreachable("Unsupported synchronization scope");
885     }
886   }
887 
888   /// The scratch address space does not need the global memory caches
889   /// to be bypassed as all memory operations by the same thread are
890   /// sequentially consistent, and no other thread can access scratch
891   /// memory.
892 
893   /// Other address spaces do not have a cache.
894 
895   return Changed;
896 }
897 
898 bool SIGfx6CacheControl::enableStoreCacheBypass(
899     const MachineBasicBlock::iterator &MI,
900     SIAtomicScope Scope,
901     SIAtomicAddrSpace AddrSpace) const {
902   assert(!MI->mayLoad() && MI->mayStore());
903   bool Changed = false;
904 
905   /// The L1 cache is write-through, so it does not need to be bypassed. There is
906   /// no bypass control for the L2 cache at the ISA level.
907 
908   return Changed;
909 }
910 
911 bool SIGfx6CacheControl::enableRMWCacheBypass(
912     const MachineBasicBlock::iterator &MI,
913     SIAtomicScope Scope,
914     SIAtomicAddrSpace AddrSpace) const {
915   assert(MI->mayLoad() && MI->mayStore());
916   bool Changed = false;
917 
918   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
919   /// bypassed, and the GLC bit is instead used to indicate if they are
920   /// return or no-return.
921   /// Note: there is no L2 cache coherent bypass control at the ISA level.
922 
923   return Changed;
924 }
925 
926 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
927     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
928     bool IsVolatile, bool IsNonTemporal) const {
929   // Only handle load and store, not atomic read-modify-write instructions. The
930   // latter use glc to indicate if the atomic returns a result and so must not
931   // be used for cache control.
932   assert(MI->mayLoad() ^ MI->mayStore());
933 
934   // Only update load and store, not LLVM IR atomic read-modify-write
935   // instructions. The latter are always marked as volatile, so treating them
936   // as volatile here would pessimize all atomics. They also do not support
937   // the nontemporal attribute.
938   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
939 
940   bool Changed = false;
941 
942   if (IsVolatile) {
943     // Set L1 cache policy to be MISS_EVICT for load instructions
944     // and MISS_LRU for store instructions.
945     // Note: there is no L2 cache bypass policy at the ISA level.
946     if (Op == SIMemOp::LOAD)
947       Changed |= enableGLCBit(MI);
948 
949     // Ensure operation has completed at system scope to cause all volatile
950     // operations to be visible outside the program in a global order. Do not
951     // request cross address space as only the global address space can be
952     // observable outside the program, so no need to cause a waitcnt for LDS
953     // address space operations.
954     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
955                           Position::AFTER);
956 
957     return Changed;
958   }
959 
960   if (IsNonTemporal) {
961     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
962     // for both loads and stores, and the L2 cache policy to STREAM.
963     Changed |= enableGLCBit(MI);
964     Changed |= enableSLCBit(MI);
965     return Changed;
966   }
967 
968   return Changed;
969 }
970 
971 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
972                                     SIAtomicScope Scope,
973                                     SIAtomicAddrSpace AddrSpace,
974                                     SIMemOp Op,
975                                     bool IsCrossAddrSpaceOrdering,
976                                     Position Pos) const {
977   bool Changed = false;
978 
979   MachineBasicBlock &MBB = *MI->getParent();
980   DebugLoc DL = MI->getDebugLoc();
981 
982   if (Pos == Position::AFTER)
983     ++MI;
984 
985   bool VMCnt = false;
986   bool LGKMCnt = false;
987 
988   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
989       SIAtomicAddrSpace::NONE) {
990     switch (Scope) {
991     case SIAtomicScope::SYSTEM:
992     case SIAtomicScope::AGENT:
993       VMCnt |= true;
994       break;
995     case SIAtomicScope::WORKGROUP:
996     case SIAtomicScope::WAVEFRONT:
997     case SIAtomicScope::SINGLETHREAD:
998       // The L1 cache keeps all memory operations in order for
999       // wavefronts in the same work-group.
1000       break;
1001     default:
1002       llvm_unreachable("Unsupported synchronization scope");
1003     }
1004   }
1005 
1006   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1007     switch (Scope) {
1008     case SIAtomicScope::SYSTEM:
1009     case SIAtomicScope::AGENT:
1010     case SIAtomicScope::WORKGROUP:
1011       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1012       // not needed as LDS operations for all waves are executed in a total
1013       // global ordering as observed by all waves. Required if also
1014       // synchronizing with global/GDS memory as LDS operations could be
1015       // reordered with respect to later global/GDS memory operations of the
1016       // same wave.
1017       LGKMCnt |= IsCrossAddrSpaceOrdering;
1018       break;
1019     case SIAtomicScope::WAVEFRONT:
1020     case SIAtomicScope::SINGLETHREAD:
1021       // The LDS keeps all memory operations in order for
1022       // the same wavefront.
1023       break;
1024     default:
1025       llvm_unreachable("Unsupported synchronization scope");
1026     }
1027   }
1028 
1029   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1030     switch (Scope) {
1031     case SIAtomicScope::SYSTEM:
1032     case SIAtomicScope::AGENT:
1033       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1034       // is not needed as GDS operations for all waves are executed in a total
1035       // global ordering as observed by all waves. Required if also
1036       // synchronizing with global/LDS memory as GDS operations could be
1037       // reordered with respect to later global/LDS memory operations of the
1038       // same wave.
1039       LGKMCnt |= IsCrossAddrSpaceOrdering;
1040       break;
1041     case SIAtomicScope::WORKGROUP:
1042     case SIAtomicScope::WAVEFRONT:
1043     case SIAtomicScope::SINGLETHREAD:
1044       // The GDS keeps all memory operations in order for
1045       // the same work-group.
1046       break;
1047     default:
1048       llvm_unreachable("Unsupported synchronization scope");
1049     }
1050   }
1051 
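       // Emit a single soft S_WAITCNT that zeroes only the required counters; a
       // counter that needs no wait is encoded at its maximum value and is ignored.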
1052   if (VMCnt || LGKMCnt) {
1053     unsigned WaitCntImmediate =
1054       AMDGPU::encodeWaitcnt(IV,
1055                             VMCnt ? 0 : getVmcntBitMask(IV),
1056                             getExpcntBitMask(IV),
1057                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1058     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1059         .addImm(WaitCntImmediate);
1060     Changed = true;
1061   }
1062 
1063   if (Pos == Position::AFTER)
1064     --MI;
1065 
1066   return Changed;
1067 }
1068 
1069 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1070                                        SIAtomicScope Scope,
1071                                        SIAtomicAddrSpace AddrSpace,
1072                                        Position Pos) const {
1073   if (!InsertCacheInv)
1074     return false;
1075 
1076   bool Changed = false;
1077 
1078   MachineBasicBlock &MBB = *MI->getParent();
1079   DebugLoc DL = MI->getDebugLoc();
1080 
1081   if (Pos == Position::AFTER)
1082     ++MI;
1083 
1084   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1085     switch (Scope) {
1086     case SIAtomicScope::SYSTEM:
1087     case SIAtomicScope::AGENT:
1088       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1089       Changed = true;
1090       break;
1091     case SIAtomicScope::WORKGROUP:
1092     case SIAtomicScope::WAVEFRONT:
1093     case SIAtomicScope::SINGLETHREAD:
1094       // No cache to invalidate.
1095       break;
1096     default:
1097       llvm_unreachable("Unsupported synchronization scope");
1098     }
1099   }
1100 
1101   /// The scratch address space does not need the global memory cache
1102   /// to be flushed as all memory operations by the same thread are
1103   /// sequentially consistent, and no other thread can access scratch
1104   /// memory.
1105 
1106   /// Other address spaces do not have a cache.
1107 
1108   if (Pos == Position::AFTER)
1109     --MI;
1110 
1111   return Changed;
1112 }
1113 
1114 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1115                                        SIAtomicScope Scope,
1116                                        SIAtomicAddrSpace AddrSpace,
1117                                        bool IsCrossAddrSpaceOrdering,
1118                                        Position Pos) const {
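       // A release on GFX6 reduces to waiting for earlier memory operations to
       // complete; no cache writeback instruction is inserted on this generation.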
1119   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1120                     IsCrossAddrSpaceOrdering, Pos);
1121 }
1122 
1123 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1124                                        SIAtomicScope Scope,
1125                                        SIAtomicAddrSpace AddrSpace,
1126                                        Position Pos) const {
1127   if (!InsertCacheInv)
1128     return false;
1129 
1130   bool Changed = false;
1131 
1132   MachineBasicBlock &MBB = *MI->getParent();
1133   DebugLoc DL = MI->getDebugLoc();
1134 
1135   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1136 
1137   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1138                                     ? AMDGPU::BUFFER_WBINVL1
1139                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1140 
1141   if (Pos == Position::AFTER)
1142     ++MI;
1143 
1144   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1145     switch (Scope) {
1146     case SIAtomicScope::SYSTEM:
1147     case SIAtomicScope::AGENT:
1148       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1149       Changed = true;
1150       break;
1151     case SIAtomicScope::WORKGROUP:
1152     case SIAtomicScope::WAVEFRONT:
1153     case SIAtomicScope::SINGLETHREAD:
1154       // No cache to invalidate.
1155       break;
1156     default:
1157       llvm_unreachable("Unsupported synchronization scope");
1158     }
1159   }
1160 
1161   /// The scratch address space does not need the global memory cache
1162   /// to be flushed as all memory operations by the same thread are
1163   /// sequentially consistent, and no other thread can access scratch
1164   /// memory.
1165 
1166   /// Other address spaces do not have a cache.
1167 
1168   if (Pos == Position::AFTER)
1169     --MI;
1170 
1171   return Changed;
1172 }
1173 
1174 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1175     const MachineBasicBlock::iterator &MI,
1176     SIAtomicScope Scope,
1177     SIAtomicAddrSpace AddrSpace) const {
1178   assert(MI->mayLoad() && !MI->mayStore());
1179   bool Changed = false;
1180 
1181   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1182     switch (Scope) {
1183     case SIAtomicScope::SYSTEM:
1184     case SIAtomicScope::AGENT:
1185       // Set the L1 cache policy to MISS_LRU.
1186       // Note: there is no L2 cache bypass policy at the ISA level.
1187       Changed |= enableGLCBit(MI);
1188       break;
1189     case SIAtomicScope::WORKGROUP:
1190       // In threadgroup split mode the waves of a work-group can be executing on
1191       // different CUs. Therefore need to bypass the L1 which is per CU.
1192       // Otherwise in non-threadgroup split mode all waves of a work-group are
1193       // on the same CU, and so the L1 does not need to be bypassed.
1194       if (ST.isTgSplitEnabled())
1195         Changed |= enableGLCBit(MI);
1196       break;
1197     case SIAtomicScope::WAVEFRONT:
1198     case SIAtomicScope::SINGLETHREAD:
1199       // No cache to bypass.
1200       break;
1201     default:
1202       llvm_unreachable("Unsupported synchronization scope");
1203     }
1204   }
1205 
1206   /// The scratch address space does not need the global memory caches
1207   /// to be bypassed as all memory operations by the same thread are
1208   /// sequentially consistent, and no other thread can access scratch
1209   /// memory.
1210 
1211   /// Other address spaces do not have a cache.
1212 
1213   return Changed;
1214 }
1215 
1216 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1217     const MachineBasicBlock::iterator &MI,
1218     SIAtomicScope Scope,
1219     SIAtomicAddrSpace AddrSpace) const {
1220   assert(!MI->mayLoad() && MI->mayStore());
1221   bool Changed = false;
1222 
1223   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1224     switch (Scope) {
1225     case SIAtomicScope::SYSTEM:
1226     case SIAtomicScope::AGENT:
1227       /// Do not set glc for store atomic operations as they implicitly write
1228       /// through the L1 cache.
1229       break;
1230     case SIAtomicScope::WORKGROUP:
1231     case SIAtomicScope::WAVEFRONT:
1232     case SIAtomicScope::SINGLETHREAD:
1233       // No cache to bypass. Store atomics implicitly write through the L1
1234       // cache.
1235       break;
1236     default:
1237       llvm_unreachable("Unsupported synchronization scope");
1238     }
1239   }
1240 
1241   /// The scratch address space does not need the global memory caches
1242   /// to be bypassed as all memory operations by the same thread are
1243   /// sequentially consistent, and no other thread can access scratch
1244   /// memory.
1245 
1246   /// Other address spaces do not have a cache.
1247 
1248   return Changed;
1249 }
1250 
1251 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1252     const MachineBasicBlock::iterator &MI,
1253     SIAtomicScope Scope,
1254     SIAtomicAddrSpace AddrSpace) const {
1255   assert(MI->mayLoad() && MI->mayStore());
1256   bool Changed = false;
1257 
1258   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1259     switch (Scope) {
1260     case SIAtomicScope::SYSTEM:
1261     case SIAtomicScope::AGENT:
1262       /// Do not set glc for RMW atomic operations as they implicitly bypass
1263       /// the L1 cache, and the glc bit is instead used to indicate if they are
1264       /// return or no-return.
1265       break;
1266     case SIAtomicScope::WORKGROUP:
1267     case SIAtomicScope::WAVEFRONT:
1268     case SIAtomicScope::SINGLETHREAD:
1269       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1270       break;
1271     default:
1272       llvm_unreachable("Unsupported synchronization scope");
1273     }
1274   }
1275 
1276   return Changed;
1277 }
1278 
1279 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1280     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1281     bool IsVolatile, bool IsNonTemporal) const {
1282   // Only handle load and store, not atomic read-modify-write instructions. The
1283   // latter use glc to indicate if the atomic returns a result and so must not
1284   // be used for cache control.
1285   assert(MI->mayLoad() ^ MI->mayStore());
1286 
1287   // Only update load and store, not LLVM IR atomic read-modify-write
1288   // instructions. The latter are always marked as volatile, so treating them
1289   // as volatile here would pessimize all atomics. They also do not support
1290   // the nontemporal attribute.
1291   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1292 
1293   bool Changed = false;
1294 
1295   if (IsVolatile) {
1296     // Set L1 cache policy to be MISS_EVICT for load instructions
1297     // and MISS_LRU for store instructions.
1298     // Note: there is no L2 cache bypass policy at the ISA level.
1299     if (Op == SIMemOp::LOAD)
1300       Changed |= enableGLCBit(MI);
1301 
1302     // Ensure operation has completed at system scope to cause all volatile
1303     // operations to be visible outside the program in a global order. Do not
1304     // request cross address space as only the global address space can be
1305     // observable outside the program, so no need to cause a waitcnt for LDS
1306     // address space operations.
1307     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1308                           Position::AFTER);
1309 
1310     return Changed;
1311   }
1312 
1313   if (IsNonTemporal) {
1314     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1315     // for both loads and stores, and the L2 cache policy to STREAM.
1316     Changed |= enableGLCBit(MI);
1317     Changed |= enableSLCBit(MI);
1318     return Changed;
1319   }
1320 
1321   return Changed;
1322 }
1323 
1324 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1325                                       SIAtomicScope Scope,
1326                                       SIAtomicAddrSpace AddrSpace,
1327                                       SIMemOp Op,
1328                                       bool IsCrossAddrSpaceOrdering,
1329                                       Position Pos) const {
1330   if (ST.isTgSplitEnabled()) {
1331     // In threadgroup split mode the waves of a work-group can be executing on
1332     // different CUs. Therefore need to wait for global or GDS memory operations
1333     // to complete to ensure they are visible to waves in the other CUs.
1334     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1335     // the same CU, so no need to wait for global memory as all waves in the
1336     // work-group access the same L1, nor wait for GDS as accesses are ordered
1337     // on a CU.
1338     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1339                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1340         (Scope == SIAtomicScope::WORKGROUP)) {
1341       // Same as GFX7 using agent scope.
1342       Scope = SIAtomicScope::AGENT;
1343     }
1344     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1345     // LDS memory operations.
1346     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1347   }
1348   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1349                                         IsCrossAddrSpaceOrdering, Pos);
1350 }
1351 
1352 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1353                                          SIAtomicScope Scope,
1354                                          SIAtomicAddrSpace AddrSpace,
1355                                          Position Pos) const {
1356   if (!InsertCacheInv)
1357     return false;
1358 
1359   bool Changed = false;
1360 
1361   MachineBasicBlock &MBB = *MI->getParent();
1362   DebugLoc DL = MI->getDebugLoc();
1363 
1364   if (Pos == Position::AFTER)
1365     ++MI;
1366 
1367   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1368     switch (Scope) {
1369     case SIAtomicScope::SYSTEM:
1370       // Ensures that following loads will not see stale remote VMEM data or
1371       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1372       // CC will never be stale due to the local memory probes.
1373       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1374       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1375       // hardware does not reorder memory operations by the same wave with
1376       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1377       // remove any cache lines of earlier writes by the same wave and ensures
1378       // later reads by the same wave will refetch the cache lines.
1379       Changed = true;
1380       break;
1381     case SIAtomicScope::AGENT:
1382       // Same as GFX7.
1383       break;
1384     case SIAtomicScope::WORKGROUP:
1385       // In threadgroup split mode the waves of a work-group can be executing on
1386       // different CUs. Therefore need to invalidate the L1 which is per CU.
1387       // Otherwise in non-threadgroup split mode all waves of a work-group are
1388       // on the same CU, and so the L1 does not need to be invalidated.
1389       if (ST.isTgSplitEnabled()) {
1390         // Same as GFX7 using agent scope.
1391         Scope = SIAtomicScope::AGENT;
1392       }
1393       break;
1394     case SIAtomicScope::WAVEFRONT:
1395     case SIAtomicScope::SINGLETHREAD:
1396       // Same as GFX7.
1397       break;
1398     default:
1399       llvm_unreachable("Unsupported synchronization scope");
1400     }
1401   }
1402 
1403   /// The scratch address space does not need the global memory cache
1404   /// to be flushed as all memory operations by the same thread are
1405   /// sequentially consistent, and no other thread can access scratch
1406   /// memory.
1407 
1408   /// Other address spaces do not have a cache.
1409 
1410   if (Pos == Position::AFTER)
1411     --MI;
1412 
1413   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1414 
1415   return Changed;
1416 }
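
// Illustrative sketch: a system-scope acquire on gfx90a inserts BUFFER_INVL2
// here and then falls through to the GFX7 handling, which covers the per-CU
// L1 invalidation for agent scope and wider.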
1417 
1418 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1419                                          SIAtomicScope Scope,
1420                                          SIAtomicAddrSpace AddrSpace,
1421                                          bool IsCrossAddrSpaceOrdering,
1422                                          Position Pos) const {
1423   bool Changed = false;
1424 
1425   MachineBasicBlock &MBB = *MI->getParent();
1426   DebugLoc DL = MI->getDebugLoc();
1427 
1428   if (Pos == Position::AFTER)
1429     ++MI;
1430 
1431   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1432     switch (Scope) {
1433     case SIAtomicScope::SYSTEM:
1434       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1435       // hardware does not reorder memory operations by the same wave with
1436       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1437       // to initiate writeback of any dirty cache lines of earlier writes by the
1438       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1439       // writeback has completed.
1440       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1441         // Set SC bits to indicate system scope.
1442         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1443       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1444       // vmcnt(0)" needed by the "BUFFER_WBL2".
1445       Changed = true;
1446       break;
1447     case SIAtomicScope::AGENT:
1448     case SIAtomicScope::WORKGROUP:
1449     case SIAtomicScope::WAVEFRONT:
1450     case SIAtomicScope::SINGLETHREAD:
1451       // Same as GFX7.
1452       break;
1453     default:
1454       llvm_unreachable("Unsupported synchronization scope");
1455     }
1456   }
1457 
1458   if (Pos == Position::AFTER)
1459     --MI;
1460 
1461   Changed |=
1462       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1463                                         IsCrossAddrSpaceOrdering, Pos);
1464 
1465   return Changed;
1466 }
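
// Illustrative sketch: a system-scope release on gfx90a inserts a BUFFER_WBL2
// with both SC bits set in its cache policy, then relies on the GFX7 handling
// to emit the "s_waitcnt vmcnt(0)" that waits for the writeback to complete.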
1467 
1468 bool SIGfx940CacheControl::enableLoadCacheBypass(
1469     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1470     SIAtomicAddrSpace AddrSpace) const {
1471   assert(MI->mayLoad() && !MI->mayStore());
1472   bool Changed = false;
1473 
1474   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1475     switch (Scope) {
1476     case SIAtomicScope::SYSTEM:
1477       // Set SC bits to indicate system scope.
1478       Changed |= enableSC0Bit(MI);
1479       Changed |= enableSC1Bit(MI);
1480       break;
1481     case SIAtomicScope::AGENT:
1482       // Set SC bits to indicate agent scope.
1483       Changed |= enableSC1Bit(MI);
1484       break;
1485     case SIAtomicScope::WORKGROUP:
1486       // In threadgroup split mode the waves of a work-group can be executing on
1487       // different CUs. Therefore need to bypass the L1 which is per CU.
1488       // Otherwise in non-threadgroup split mode all waves of a work-group are
1489       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1490       // bits to indicate work-group scope will do this automatically.
1491       Changed |= enableSC0Bit(MI);
1492       break;
1493     case SIAtomicScope::WAVEFRONT:
1494     case SIAtomicScope::SINGLETHREAD:
1495       // Leave SC bits unset to indicate wavefront scope.
1496       break;
1497     default:
1498       llvm_unreachable("Unsupported synchronization scope");
1499     }
1500   }
1501 
1502   /// The scratch address space does not need the global memory caches
1503   /// to be bypassed as all memory operations by the same thread are
1504   /// sequentially consistent, and no other thread can access scratch
1505   /// memory.
1506 
1507   /// Other address spaces do not have a cache.
1508 
1509   return Changed;
1510 }
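
// Summary of the SC-bit encoding used by the gfx940 bypass helpers above and
// below: system -> SC0|SC1, agent -> SC1, work-group -> SC0, wavefront and
// single thread -> neither bit set. RMW atomics are the exception and only
// use SC1 (see enableRMWCacheBypass).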
1511 
1512 bool SIGfx940CacheControl::enableStoreCacheBypass(
1513     const MachineBasicBlock::iterator &MI,
1514     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1515   assert(!MI->mayLoad() && MI->mayStore());
1516   bool Changed = false;
1517 
1518   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1519     switch (Scope) {
1520     case SIAtomicScope::SYSTEM:
1521       // Set SC bits to indicate system scope.
1522       Changed |= enableSC0Bit(MI);
1523       Changed |= enableSC1Bit(MI);
1524       break;
1525     case SIAtomicScope::AGENT:
1526       // Set SC bits to indicate agent scope.
1527       Changed |= enableSC1Bit(MI);
1528       break;
1529     case SIAtomicScope::WORKGROUP:
1530       // Set SC bits to indicate workgroup scope.
1531       Changed |= enableSC0Bit(MI);
1532       break;
1533     case SIAtomicScope::WAVEFRONT:
1534     case SIAtomicScope::SINGLETHREAD:
1535       // Leave SC bits unset to indicate wavefront scope.
1536       break;
1537     default:
1538       llvm_unreachable("Unsupported synchronization scope");
1539     }
1540   }
1541 
1542   /// The scratch address space does not need the global memory caches
1543   /// to be bypassed as all memory operations by the same thread are
1544   /// sequentially consistent, and no other thread can access scratch
1545   /// memory.
1546 
1547   /// Other address spaces do not have a cache.
1548 
1549   return Changed;
1550 }
1551 
1552 bool SIGfx940CacheControl::enableRMWCacheBypass(
1553     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1554     SIAtomicAddrSpace AddrSpace) const {
1555   assert(MI->mayLoad() && MI->mayStore());
1556   bool Changed = false;
1557 
1558   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1559     switch (Scope) {
1560     case SIAtomicScope::SYSTEM:
1561       // Set SC1 bit to indicate system scope.
1562       Changed |= enableSC1Bit(MI);
1563       break;
1564     case SIAtomicScope::AGENT:
1565     case SIAtomicScope::WORKGROUP:
1566     case SIAtomicScope::WAVEFRONT:
1567     case SIAtomicScope::SINGLETHREAD:
1568       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1569       // to indicate system or agent scope. The SC0 bit is used to indicate if
1570       // they are return or no-return. Leave SC1 bit unset to indicate agent
1571       // scope.
1572       break;
1573     default:
1574       llvm_unreachable("Unsupported synchronization scope");
1575     }
1576   }
1577 
1578   return Changed;
1579 }
1580 
1581 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1582     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1583     bool IsVolatile, bool IsNonTemporal) const {
1584   // Only handle load and store, not atomic read-modify-write instructions. The
1585   // latter use glc to indicate if the atomic returns a result and so must not
1586   // be used for cache control.
1587   assert(MI->mayLoad() ^ MI->mayStore());
1588 
1589   // Only update load and store, not LLVM IR atomic read-modify-write
1590   // instructions. The latter are always marked as volatile, so handling them
1591   // here would pessimize all atomics. They also do not support
1592   // the nontemporal attribute.
1593   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1594 
1595   bool Changed = false;
1596 
1597   if (IsVolatile) {
1598     // Set SC bits to indicate system scope.
1599     Changed |= enableSC0Bit(MI);
1600     Changed |= enableSC1Bit(MI);
1601 
1602     // Ensure operation has completed at system scope to cause all volatile
1603     // operations to be visible outside the program in a global order. Do not
1604     // request cross address space as only the global address space can be
1605     // observable outside the program, so no need to cause a waitcnt for LDS
1606     // address space operations.
1607     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1608                           Position::AFTER);
1609 
1610     return Changed;
1611   }
1612 
1613   if (IsNonTemporal) {
1614     Changed |= enableNTBit(MI);
1615     return Changed;
1616   }
1617 
1618   return Changed;
1619 }
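
// Illustrative sketch: a volatile global load or store on gfx940 gets both SC
// bits set plus a system-scope wait inserted after it, whereas a nontemporal
// access only gets the NT bit and no extra wait.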
1620 
1621 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1622                                          SIAtomicScope Scope,
1623                                          SIAtomicAddrSpace AddrSpace,
1624                                          Position Pos) const {
1625   if (!InsertCacheInv)
1626     return false;
1627 
1628   bool Changed = false;
1629 
1630   MachineBasicBlock &MBB = *MI->getParent();
1631   DebugLoc DL = MI->getDebugLoc();
1632 
1633   if (Pos == Position::AFTER)
1634     ++MI;
1635 
1636   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1637     switch (Scope) {
1638     case SIAtomicScope::SYSTEM:
1639       // Ensures that following loads will not see stale remote VMEM data or
1640       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1641       // CC will never be stale due to the local memory probes.
1642       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1643           // Set SC bits to indicate system scope.
1644           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1645       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1646       // hardware does not reorder memory operations by the same wave with
1647       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1648       // remove any cache lines of earlier writes by the same wave and ensures
1649       // later reads by the same wave will refetch the cache lines.
1650       Changed = true;
1651       break;
1652     case SIAtomicScope::AGENT:
1653       // Ensures that following loads will not see stale remote data or local
1654       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1655       // due to the memory probes.
1656       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1657           // Set SC bits to indicate agent scope.
1658           .addImm(AMDGPU::CPol::SC1);
1659       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1660       // does not reorder memory operations with respect to a preceding buffer
1661       // invalidate. The invalidate is guaranteed to remove any cache lines of
1662       // earlier writes and ensures later reads will refetch the cache lines.
1663       Changed = true;
1664       break;
1665     case SIAtomicScope::WORKGROUP:
1666       // In threadgroup split mode the waves of a work-group can be executing on
1667       // different CUs. Therefore need to invalidate the L1 which is per CU.
1668       // Otherwise in non-threadgroup split mode all waves of a work-group are
1669       // on the same CU, and so the L1 does not need to be invalidated.
1670       if (ST.isTgSplitEnabled()) {
1671         // Ensures L1 is invalidated if in threadgroup split mode. In
1672         // non-threadgroup split mode it is a NOP, but there is no point generating
1673         // it when we know we are not in that mode.
1674         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1675             // Set SC bits to indicate work-group scope.
1676             .addImm(AMDGPU::CPol::SC0);
1677         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1678         // does not reorder memory operations with respect to a preceding buffer
1679         // invalidate. The invalidate is guaranteed to remove any cache lines of
1680         // earlier writes and ensures later reads will refetch the cache lines.
1681         Changed = true;
1682       }
1683       break;
1684     case SIAtomicScope::WAVEFRONT:
1685     case SIAtomicScope::SINGLETHREAD:
1686       // Could generate "BUFFER_INV" but it would do nothing as there are no
1687       // caches to invalidate.
1688       break;
1689     default:
1690       llvm_unreachable("Unsupported synchronization scope");
1691     }
1692   }
1693 
1694   /// The scratch address space does not need the global memory cache
1695   /// to be flushed as all memory operations by the same thread are
1696   /// sequentially consistent, and no other thread can access scratch
1697   /// memory.
1698 
1699   /// Other address spaces do not have a cache.
1700 
1701   if (Pos == Position::AFTER)
1702     --MI;
1703 
1704   return Changed;
1705 }
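
// Illustrative sketch: an agent-scope acquire on gfx940 becomes a BUFFER_INV
// with SC1 set, a system-scope acquire sets SC0 and SC1, and a work-group
// acquire only emits BUFFER_INV (SC0) when threadgroup split mode is enabled.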
1706 
1707 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1708                                          SIAtomicScope Scope,
1709                                          SIAtomicAddrSpace AddrSpace,
1710                                          bool IsCrossAddrSpaceOrdering,
1711                                          Position Pos) const {
1712   bool Changed = false;
1713 
1714   MachineBasicBlock &MBB = *MI->getParent();
1715   DebugLoc DL = MI->getDebugLoc();
1716 
1717   if (Pos == Position::AFTER)
1718     ++MI;
1719 
1720   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1721     switch (Scope) {
1722     case SIAtomicScope::SYSTEM:
1723       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1724       // hardware does not reorder memory operations by the same wave with
1725       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1726       // to initiate writeback of any dirty cache lines of earlier writes by the
1727       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1728       // writeback has completed.
1729       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1730           // Set SC bits to indicate system scope.
1731           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1732       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1733       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1734       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1735       Changed = true;
1736       break;
1737     case SIAtomicScope::AGENT:
1738       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1739           // Set SC bits to indicate agent scope.
1740           .addImm(AMDGPU::CPol::SC1);
1741 
1742       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1743       // SIAtomicScope::AGENT, the following insertWait will generate the
1744       // required "S_WAITCNT vmcnt(0)".
1745       Changed = true;
1746       break;
1747     case SIAtomicScope::WORKGROUP:
1748     case SIAtomicScope::WAVEFRONT:
1749     case SIAtomicScope::SINGLETHREAD:
1750       // Do not generate "BUFFER_WBL2" as there are no caches it would
1751       // writeback, and would require an otherwise unnecessary
1752       // "S_WAITCNT vmcnt(0)".
1753       break;
1754     default:
1755       llvm_unreachable("Unsupported synchronization scope");
1756     }
1757   }
1758 
1759   if (Pos == Position::AFTER)
1760     --MI;
1761 
1762   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1763   // S_WAITCNT needed.
1764   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1765                         IsCrossAddrSpaceOrdering, Pos);
1766 
1767   return Changed;
1768 }
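
// Illustrative sketch: an agent-scope release on gfx940 inserts a BUFFER_WBL2
// with SC1 set, and the insertWait call above then provides the
// "s_waitcnt vmcnt(0)" that waits for the writeback to complete.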
1769 
1770 bool SIGfx10CacheControl::enableLoadCacheBypass(
1771     const MachineBasicBlock::iterator &MI,
1772     SIAtomicScope Scope,
1773     SIAtomicAddrSpace AddrSpace) const {
1774   assert(MI->mayLoad() && !MI->mayStore());
1775   bool Changed = false;
1776 
1777   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1778     switch (Scope) {
1779     case SIAtomicScope::SYSTEM:
1780     case SIAtomicScope::AGENT:
1781       // Set the L0 and L1 cache policies to MISS_EVICT.
1782       // Note: there is no L2 cache coherent bypass control at the ISA level.
1783       Changed |= enableGLCBit(MI);
1784       Changed |= enableDLCBit(MI);
1785       break;
1786     case SIAtomicScope::WORKGROUP:
1787       // In WGP mode the waves of a work-group can be executing on either CU of
1788       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1789       // CU mode all waves of a work-group are on the same CU, and so the L0
1790       // does not need to be bypassed.
1791       if (!ST.isCuModeEnabled())
1792         Changed |= enableGLCBit(MI);
1793       break;
1794     case SIAtomicScope::WAVEFRONT:
1795     case SIAtomicScope::SINGLETHREAD:
1796       // No cache to bypass.
1797       break;
1798     default:
1799       llvm_unreachable("Unsupported synchronization scope");
1800     }
1801   }
1802 
1803   /// The scratch address space does not need the global memory caches
1804   /// to be bypassed as all memory operations by the same thread are
1805   /// sequentially consistent, and no other thread can access scratch
1806   /// memory.
1807 
1808   /// Other address spaces do not have a cache.
1809 
1810   return Changed;
1811 }
1812 
1813 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1814     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1815     bool IsVolatile, bool IsNonTemporal) const {
1816 
1817   // Only handle load and store, not atomic read-modify-write instructions. The
1818   // latter use glc to indicate if the atomic returns a result and so must not
1819   // be used for cache control.
1820   assert(MI->mayLoad() ^ MI->mayStore());
1821 
1822   // Only update load and store, not LLVM IR atomic read-modify-write
1823   // instructions. The latter are always marked as volatile, so handling them
1824   // here would pessimize all atomics. They also do not support
1825   // the nontemporal attribute.
1826   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1827 
1828   bool Changed = false;
1829 
1830   if (IsVolatile) {
1831     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1832     // and MISS_LRU for store instructions.
1833     // Note: there is no L2 cache coherent bypass control at the ISA level.
1834     if (Op == SIMemOp::LOAD) {
1835       Changed |= enableGLCBit(MI);
1836       Changed |= enableDLCBit(MI);
1837     }
1838 
1839     // Ensure operation has completed at system scope to cause all volatile
1840     // operations to be visible outside the program in a global order. Do not
1841     // request cross address space as only the global address space can be
1842     // observable outside the program, so no need to cause a waitcnt for LDS
1843     // address space operations.
1844     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1845                           Position::AFTER);
1846     return Changed;
1847   }
1848 
1849   if (IsNonTemporal) {
1850     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1851     // and L2 cache policy to STREAM.
1852     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1853     // to MISS_EVICT and the L2 cache policy to STREAM.
1854     if (Op == SIMemOp::STORE)
1855       Changed |= enableGLCBit(MI);
1856     Changed |= enableSLCBit(MI);
1857 
1858     return Changed;
1859   }
1860 
1861   return Changed;
1862 }
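
// Illustrative sketch (assembly shown only as an approximation): on gfx10 a
// volatile global load would typically come out as
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
// and a nontemporal global store as
//   global_store_dword v[0:1], v2, off glc slc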
1863 
1864 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1865                                      SIAtomicScope Scope,
1866                                      SIAtomicAddrSpace AddrSpace,
1867                                      SIMemOp Op,
1868                                      bool IsCrossAddrSpaceOrdering,
1869                                      Position Pos) const {
1870   bool Changed = false;
1871 
1872   MachineBasicBlock &MBB = *MI->getParent();
1873   DebugLoc DL = MI->getDebugLoc();
1874 
1875   if (Pos == Position::AFTER)
1876     ++MI;
1877 
1878   bool VMCnt = false;
1879   bool VSCnt = false;
1880   bool LGKMCnt = false;
1881 
1882   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1883       SIAtomicAddrSpace::NONE) {
1884     switch (Scope) {
1885     case SIAtomicScope::SYSTEM:
1886     case SIAtomicScope::AGENT:
1887       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1888         VMCnt |= true;
1889       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1890         VSCnt |= true;
1891       break;
1892     case SIAtomicScope::WORKGROUP:
1893       // In WGP mode the waves of a work-group can be executing on either CU of
1894       // the WGP. Therefore need to wait for operations to complete to ensure
1895       // they are visible to waves in the other CU as the L0 is per CU.
1896       // Otherwise in CU mode all waves of a work-group are on the same CU
1897       // which shares the same L0.
1898       if (!ST.isCuModeEnabled()) {
1899         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1900           VMCnt |= true;
1901         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1902           VSCnt |= true;
1903       }
1904       break;
1905     case SIAtomicScope::WAVEFRONT:
1906     case SIAtomicScope::SINGLETHREAD:
1907       // The L0 cache keeps all memory operations in order for
1908       // work-items in the same wavefront.
1909       break;
1910     default:
1911       llvm_unreachable("Unsupported synchronization scope");
1912     }
1913   }
1914 
1915   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1916     switch (Scope) {
1917     case SIAtomicScope::SYSTEM:
1918     case SIAtomicScope::AGENT:
1919     case SIAtomicScope::WORKGROUP:
1920       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1921       // not needed as LDS operations for all waves are executed in a total
1922       // global ordering as observed by all waves. Required if also
1923       // synchronizing with global/GDS memory as LDS operations could be
1924       // reordered with respect to later global/GDS memory operations of the
1925       // same wave.
1926       LGKMCnt |= IsCrossAddrSpaceOrdering;
1927       break;
1928     case SIAtomicScope::WAVEFRONT:
1929     case SIAtomicScope::SINGLETHREAD:
1930       // The LDS keeps all memory operations in order for
1931       // the same wavefront.
1932       break;
1933     default:
1934       llvm_unreachable("Unsupported synchronization scope");
1935     }
1936   }
1937 
1938   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1939     switch (Scope) {
1940     case SIAtomicScope::SYSTEM:
1941     case SIAtomicScope::AGENT:
1942       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1943       // is not needed as GDS operations for all waves are executed in a total
1944       // global ordering as observed by all waves. Required if also
1945       // synchronizing with global/LDS memory as GDS operations could be
1946       // reordered with respect to later global/LDS memory operations of the
1947       // same wave.
1948       LGKMCnt |= IsCrossAddrSpaceOrdering;
1949       break;
1950     case SIAtomicScope::WORKGROUP:
1951     case SIAtomicScope::WAVEFRONT:
1952     case SIAtomicScope::SINGLETHREAD:
1953       // The GDS keeps all memory operations in order for
1954       // the same work-group.
1955       break;
1956     default:
1957       llvm_unreachable("Unsupported synchronization scope");
1958     }
1959   }
1960 
1961   if (VMCnt || LGKMCnt) {
1962     unsigned WaitCntImmediate =
1963       AMDGPU::encodeWaitcnt(IV,
1964                             VMCnt ? 0 : getVmcntBitMask(IV),
1965                             getExpcntBitMask(IV),
1966                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1967     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1968         .addImm(WaitCntImmediate);
1969     Changed = true;
1970   }
1971 
1972   if (VSCnt) {
1973     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1974         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1975         .addImm(0);
1976     Changed = true;
1977   }
1978 
1979   if (Pos == Position::AFTER)
1980     --MI;
1981 
1982   return Changed;
1983 }
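
// Illustrative sketch: gfx10 splits the VMEM counters, so an agent-scope wait
// covering both loads and stores on global memory would typically produce
//   s_waitcnt vmcnt(0)
//   s_waitcnt_vscnt null, 0
// (emitted here as the _soft variants so the later waitcnt pass may adjust
// them).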
1984 
1985 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1986                                         SIAtomicScope Scope,
1987                                         SIAtomicAddrSpace AddrSpace,
1988                                         Position Pos) const {
1989   if (!InsertCacheInv)
1990     return false;
1991 
1992   bool Changed = false;
1993 
1994   MachineBasicBlock &MBB = *MI->getParent();
1995   DebugLoc DL = MI->getDebugLoc();
1996 
1997   if (Pos == Position::AFTER)
1998     ++MI;
1999 
2000   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2001     switch (Scope) {
2002     case SIAtomicScope::SYSTEM:
2003     case SIAtomicScope::AGENT:
2004       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2005       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2006       Changed = true;
2007       break;
2008     case SIAtomicScope::WORKGROUP:
2009       // In WGP mode the waves of a work-group can be executing on either CU of
2010       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2011       // in CU mode all waves of a work-group are on the same CU, and so the
2012       // L0 does not need to be invalidated.
2013       if (!ST.isCuModeEnabled()) {
2014         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2015         Changed = true;
2016       }
2017       break;
2018     case SIAtomicScope::WAVEFRONT:
2019     case SIAtomicScope::SINGLETHREAD:
2020       // No cache to invalidate.
2021       break;
2022     default:
2023       llvm_unreachable("Unsupported synchronization scope");
2024     }
2025   }
2026 
2027   /// The scratch address space does not need the global memory cache
2028   /// to be flushed as all memory operations by the same thread are
2029   /// sequentially consistent, and no other thread can access scratch
2030   /// memory.
2031 
2032   /// Other address spaces do not have a cache.
2033 
2034   if (Pos == Position::AFTER)
2035     --MI;
2036 
2037   return Changed;
2038 }
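
// Illustrative sketch: an agent-scope acquire on gfx10 inserts
//   buffer_gl0_inv
//   buffer_gl1_inv
// while a work-group-scope acquire only needs buffer_gl0_inv, and only when
// the subtarget is in WGP (non-CU) mode.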
2039 
2040 bool SIGfx11CacheControl::enableLoadCacheBypass(
2041     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2042     SIAtomicAddrSpace AddrSpace) const {
2043   assert(MI->mayLoad() && !MI->mayStore());
2044   bool Changed = false;
2045 
2046   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2047     switch (Scope) {
2048     case SIAtomicScope::SYSTEM:
2049     case SIAtomicScope::AGENT:
2050       // Set the L0 and L1 cache policies to MISS_EVICT.
2051       // Note: there is no L2 cache coherent bypass control at the ISA level.
2052       Changed |= enableGLCBit(MI);
2053       break;
2054     case SIAtomicScope::WORKGROUP:
2055       // In WGP mode the waves of a work-group can be executing on either CU of
2056       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2057       // CU mode all waves of a work-group are on the same CU, and so the L0
2058       // does not need to be bypassed.
2059       if (!ST.isCuModeEnabled())
2060         Changed |= enableGLCBit(MI);
2061       break;
2062     case SIAtomicScope::WAVEFRONT:
2063     case SIAtomicScope::SINGLETHREAD:
2064       // No cache to bypass.
2065       break;
2066     default:
2067       llvm_unreachable("Unsupported synchronization scope");
2068     }
2069   }
2070 
2071   /// The scratch address space does not need the global memory caches
2072   /// to be bypassed as all memory operations by the same thread are
2073   /// sequentially consistent, and no other thread can access scratch
2074   /// memory.
2075 
2076   /// Other address spaces do not have a cache.
2077 
2078   return Changed;
2079 }
2080 
2081 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2082     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2083     bool IsVolatile, bool IsNonTemporal) const {
2084 
2085   // Only handle load and store, not atomic read-modify-write instructions. The
2086   // latter use glc to indicate if the atomic returns a result and so must not
2087   // be used for cache control.
2088   assert(MI->mayLoad() ^ MI->mayStore());
2089 
2090   // Only update load and store, not LLVM IR atomic read-modify-write
2091   // instructions. The latter are always marked as volatile, so handling them
2092   // here would pessimize all atomics. They also do not support
2093   // the nontemporal attribute.
2094   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2095 
2096   bool Changed = false;
2097 
2098   if (IsVolatile) {
2099     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2100     // and MISS_LRU for store instructions.
2101     // Note: there is no L2 cache coherent bypass control at the ISA level.
2102     if (Op == SIMemOp::LOAD)
2103       Changed |= enableGLCBit(MI);
2104 
2105     // Set MALL NOALLOC for load and store instructions.
2106     Changed |= enableDLCBit(MI);
2107 
2108     // Ensure operation has completed at system scope to cause all volatile
2109     // operations to be visible outside the program in a global order. Do not
2110     // request cross address space as only the global address space can be
2111     // observable outside the program, so no need to cause a waitcnt for LDS
2112     // address space operations.
2113     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2114                           Position::AFTER);
2115     return Changed;
2116   }
2117 
2118   if (IsNonTemporal) {
2119     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2120     // and L2 cache policy to STREAM.
2121     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2122     // to MISS_EVICT and the L2 cache policy to STREAM.
2123     if (Op == SIMemOp::STORE)
2124       Changed |= enableGLCBit(MI);
2125     Changed |= enableSLCBit(MI);
2126 
2127     // Set MALL NOALLOC for load and store instructions.
2128     Changed |= enableDLCBit(MI);
2129     return Changed;
2130   }
2131 
2132   return Changed;
2133 }
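
// Illustrative sketch: gfx11 differs from gfx10 mainly in that DLC now selects
// MALL NOALLOC, so a volatile global load would typically get glc and dlc plus
// a trailing "s_waitcnt vmcnt(0)", and a nontemporal global store would get
// glc, slc and dlc.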
2134 
2135 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2136   if (AtomicPseudoMIs.empty())
2137     return false;
2138 
2139   for (auto &MI : AtomicPseudoMIs)
2140     MI->eraseFromParent();
2141 
2142   AtomicPseudoMIs.clear();
2143   return true;
2144 }
2145 
2146 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2147                                    MachineBasicBlock::iterator &MI) {
2148   assert(MI->mayLoad() && !MI->mayStore());
2149 
2150   bool Changed = false;
2151 
2152   if (MOI.isAtomic()) {
2153     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2154         MOI.getOrdering() == AtomicOrdering::Acquire ||
2155         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2156       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2157                                            MOI.getOrderingAddrSpace());
2158     }
2159 
2160     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2161       Changed |= CC->insertWait(MI, MOI.getScope(),
2162                                 MOI.getOrderingAddrSpace(),
2163                                 SIMemOp::LOAD | SIMemOp::STORE,
2164                                 MOI.getIsCrossAddressSpaceOrdering(),
2165                                 Position::BEFORE);
2166 
2167     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2168         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2169       Changed |= CC->insertWait(MI, MOI.getScope(),
2170                                 MOI.getInstrAddrSpace(),
2171                                 SIMemOp::LOAD,
2172                                 MOI.getIsCrossAddressSpaceOrdering(),
2173                                 Position::AFTER);
2174       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2175                                    MOI.getOrderingAddrSpace(),
2176                                    Position::AFTER);
2177     }
2178 
2179     return Changed;
2180   }
2181 
2182   // Atomic instructions already bypass caches to the scope specified by the
2183   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2184   // need additional treatment.
2185   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2186                                                 SIMemOp::LOAD, MOI.isVolatile(),
2187                                                 MOI.isNonTemporal());
2188   return Changed;
2189 }
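
// Illustrative sketch: for IR along the lines of
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// the path above enables the load cache bypass for agent scope, inserts a wait
// on the load after it, and then an acquire cache invalidate so later loads do
// not see stale data.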
2190 
2191 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2192                                     MachineBasicBlock::iterator &MI) {
2193   assert(!MI->mayLoad() && MI->mayStore());
2194 
2195   bool Changed = false;
2196 
2197   if (MOI.isAtomic()) {
2198     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2199         MOI.getOrdering() == AtomicOrdering::Release ||
2200         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2201       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2202                                             MOI.getOrderingAddrSpace());
2203     }
2204 
2205     if (MOI.getOrdering() == AtomicOrdering::Release ||
2206         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2207       Changed |= CC->insertRelease(MI, MOI.getScope(),
2208                                    MOI.getOrderingAddrSpace(),
2209                                    MOI.getIsCrossAddressSpaceOrdering(),
2210                                    Position::BEFORE);
2211 
2212     return Changed;
2213   }
2214 
2215   // Atomic instructions already bypass caches to the scope specified by the
2216   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2217   // need additional treatment.
2218   Changed |= CC->enableVolatileAndOrNonTemporal(
2219       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2220       MOI.isNonTemporal());
2221   return Changed;
2222 }
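
// Illustrative sketch: a store such as
//   store atomic i32 %v, ptr addrspace(1) %p syncscope("agent") release, align 4
// gets the store cache bypass enabled and an insertRelease (writeback plus
// wait) placed immediately before it.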
2223 
2224 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2225                                           MachineBasicBlock::iterator &MI) {
2226   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2227 
2228   AtomicPseudoMIs.push_back(MI);
2229   bool Changed = false;
2230 
2231   if (MOI.isAtomic()) {
2232     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2233       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2234                                 SIMemOp::LOAD | SIMemOp::STORE,
2235                                 MOI.getIsCrossAddressSpaceOrdering(),
2236                                 Position::BEFORE);
2237 
2238     if (MOI.getOrdering() == AtomicOrdering::Release ||
2239         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2240         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2241       /// TODO: This relies on a barrier always generating a waitcnt
2242       /// for LDS to ensure it is not reordered with the completion of
2243       /// the preceding LDS operations. If the barrier had a memory
2244       /// ordering and memory scope, then the library does not need to
2245       /// generate a fence. Could add support in this file for
2246       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2247       /// adding S_WAITCNT before a S_BARRIER.
2248       Changed |= CC->insertRelease(MI, MOI.getScope(),
2249                                    MOI.getOrderingAddrSpace(),
2250                                    MOI.getIsCrossAddressSpaceOrdering(),
2251                                    Position::BEFORE);
2252 
2253     // TODO: If both release and invalidate are happening they could be combined
2254     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2255     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2256     // track cache invalidate and write back instructions.
2257 
2258     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2259         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2260         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2261       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2262                                    MOI.getOrderingAddrSpace(),
2263                                    Position::BEFORE);
2264 
2265     return Changed;
2266   }
2267 
2268   return Changed;
2269 }
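
// Illustrative sketch: fence syncscope("workgroup") acq_rel is handled by
// queuing the ATOMIC_FENCE pseudo for deletion and inserting, at its position,
// the release (wait/writeback) followed by the acquire (cache invalidate) for
// the work-group scope and the fence's ordering address space.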
2270 
2271 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2272   MachineBasicBlock::iterator &MI) {
2273   assert(MI->mayLoad() && MI->mayStore());
2274 
2275   bool Changed = false;
2276 
2277   if (MOI.isAtomic()) {
2278     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2279         MOI.getOrdering() == AtomicOrdering::Acquire ||
2280         MOI.getOrdering() == AtomicOrdering::Release ||
2281         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2282         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2283       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2284                                           MOI.getInstrAddrSpace());
2285     }
2286 
2287     if (MOI.getOrdering() == AtomicOrdering::Release ||
2288         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2289         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2290         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2291       Changed |= CC->insertRelease(MI, MOI.getScope(),
2292                                    MOI.getOrderingAddrSpace(),
2293                                    MOI.getIsCrossAddressSpaceOrdering(),
2294                                    Position::BEFORE);
2295 
2296     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2297         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2298         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2299         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2300         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2301       Changed |= CC->insertWait(MI, MOI.getScope(),
2302                                 MOI.getInstrAddrSpace(),
2303                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2304                                                    SIMemOp::STORE,
2305                                 MOI.getIsCrossAddressSpaceOrdering(),
2306                                 Position::AFTER);
2307       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2308                                    MOI.getOrderingAddrSpace(),
2309                                    Position::AFTER);
2310     }
2311 
2312     return Changed;
2313   }
2314 
2315   return Changed;
2316 }
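
// Illustrative sketch: an atomicrmw add at syncscope("agent") seq_cst gets the
// RMW cache bypass, an insertRelease before it, and an insertWait plus
// insertAcquire after it; the wait is on LOAD for a returning atomic and on
// STORE otherwise.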
2317 
2318 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2319   bool Changed = false;
2320 
2321   SIMemOpAccess MOA(MF);
2322   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2323 
2324   for (auto &MBB : MF) {
2325     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2326 
2327       // Unbundle instructions after the post-RA scheduler.
2328       if (MI->isBundle() && MI->mayLoadOrStore()) {
2329         MachineBasicBlock::instr_iterator II(MI->getIterator());
2330         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2331              I != E && I->isBundledWithPred(); ++I) {
2332           I->unbundleFromPred();
2333           for (MachineOperand &MO : I->operands())
2334             if (MO.isReg())
2335               MO.setIsInternalRead(false);
2336         }
2337 
2338         MI->eraseFromParent();
2339         MI = II->getIterator();
2340       }
2341 
2342       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2343         continue;
2344 
2345       if (const auto &MOI = MOA.getLoadInfo(MI))
2346         Changed |= expandLoad(*MOI, MI);
2347       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2348         Changed |= expandStore(*MOI, MI);
2349         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2350       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2351         Changed |= expandAtomicFence(*MOI, MI);
2352       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2353         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2354     }
2355   }
2356 
2357   Changed |= removeAtomicPseudoMIs();
2358   return Changed;
2359 }
2360 
2361 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2362 
2363 char SIMemoryLegalizer::ID = 0;
2364 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2365 
2366 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2367   return new SIMemoryLegalizer();
2368 }
2369