1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/Support/TargetParser.h"
26
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35 cl::desc("Use this to skip inserting cache invalidating instructions."));
36
37 namespace {
38
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43 NONE = 0u,
44 LOAD = 1u << 0,
45 STORE = 1u << 1,
46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52 BEFORE,
53 AFTER
54 };
55
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58 NONE,
59 SINGLETHREAD,
60 WAVEFRONT,
61 WORKGROUP,
62 AGENT,
63 SYSTEM
64 };
65
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operations. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69 NONE = 0u,
70 GLOBAL = 1u << 0,
71 LDS = 1u << 1,
72 SCRATCH = 1u << 2,
73 GDS = 1u << 3,
74 OTHER = 1u << 4,
75
76 /// The address spaces that can be accessed by a FLAT instruction.
77 FLAT = GLOBAL | LDS | SCRATCH,
78
79 /// The address spaces that support atomic instructions.
80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81
82 /// All address spaces.
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84
85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
87
88 class SIMemOpInfo final {
89 private:
90
91 friend class SIMemOpAccess;
92
93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98 bool IsCrossAddressSpaceOrdering = false;
99 bool IsVolatile = false;
100 bool IsNonTemporal = false;
101
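// Note: the default arguments are the most conservative assumptions (a
// sequentially consistent, system-scope operation over all address spaces);
// a default-constructed SIMemOpInfo is used when an instruction carries no
// memory operands to inspect.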
102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106 bool IsCrossAddressSpaceOrdering = true,
107 AtomicOrdering FailureOrdering =
108 AtomicOrdering::SequentiallyConsistent,
109 bool IsVolatile = false,
110 bool IsNonTemporal = false)
111 : Ordering(Ordering), FailureOrdering(FailureOrdering),
112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113 InstrAddrSpace(InstrAddrSpace),
114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115 IsVolatile(IsVolatile),
116 IsNonTemporal(IsNonTemporal) {
117
118 if (Ordering == AtomicOrdering::NotAtomic) {
119 assert(Scope == SIAtomicScope::NONE &&
120 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121 !IsCrossAddressSpaceOrdering &&
122 FailureOrdering == AtomicOrdering::NotAtomic);
123 return;
124 }
125
126 assert(Scope != SIAtomicScope::NONE &&
127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128 SIAtomicAddrSpace::NONE &&
129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE);
131
132 // There is also no cross address space ordering if the ordering
133 // address space is the same as the instruction address space and
134 // only contains a single address space.
135 if ((OrderingAddrSpace == InstrAddrSpace) &&
136 isPowerOf2_32(uint32_t(InstrAddrSpace)))
137 this->IsCrossAddressSpaceOrdering = false;
138
139 // Limit the scope to the maximum supported by the instruction's address
140 // spaces.
141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142 SIAtomicAddrSpace::NONE) {
143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144 } else if ((InstrAddrSpace &
145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146 SIAtomicAddrSpace::NONE) {
147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148 } else if ((InstrAddrSpace &
149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152 }
153 }
154
155 public:
156 /// \returns Atomic synchronization scope of the machine instruction used to
157 /// create this SIMemOpInfo.
158 SIAtomicScope getScope() const {
159 return Scope;
160 }
161
162 /// \returns Ordering constraint of the machine instruction used to
163 /// create this SIMemOpInfo.
164 AtomicOrdering getOrdering() const {
165 return Ordering;
166 }
167
168 /// \returns Failure ordering constraint of the machine instruction used to
169 /// create this SIMemOpInfo.
170 AtomicOrdering getFailureOrdering() const {
171 return FailureOrdering;
172 }
173
174 /// \returns The address spaces accessed by the machine
175 /// instruction used to create this SIMemOpInfo.
176 SIAtomicAddrSpace getInstrAddrSpace() const {
177 return InstrAddrSpace;
178 }
179
180 /// \returns The address spaces that must be ordered by the machine
181 /// instruction used to create this SIMemOpInfo.
182 SIAtomicAddrSpace getOrderingAddrSpace() const {
183 return OrderingAddrSpace;
184 }
185
186 /// \returns True iff memory ordering of operations on
187 /// different address spaces is required.
188 bool getIsCrossAddressSpaceOrdering() const {
189 return IsCrossAddressSpaceOrdering;
190 }
191
192 /// \returns True if memory access of the machine instruction used to
193 /// create this SIMemOpInfo is volatile, false otherwise.
194 bool isVolatile() const {
195 return IsVolatile;
196 }
197
198 /// \returns True if memory access of the machine instruction used to
199 /// create this SIMemOpInfo is nontemporal, false otherwise.
200 bool isNonTemporal() const {
201 return IsNonTemporal;
202 }
203
204 /// \returns True if ordering constraint of the machine instruction used to
205 /// create this SIMemOpInfo is unordered or higher, false otherwise.
206 bool isAtomic() const {
207 return Ordering != AtomicOrdering::NotAtomic;
208 }
209
210 };
211
212 class SIMemOpAccess final {
213 private:
214 AMDGPUMachineModuleInfo *MMI = nullptr;
215
216 /// Reports unsupported message \p Msg for \p MI to LLVM context.
217 void reportUnsupported(const MachineBasicBlock::iterator &MI,
218 const char *Msg) const;
219
220 /// Inspects the target synchronization scope \p SSID and determines
221 /// the SI atomic scope it corresponds to, the address spaces it
222 /// covers, and whether the memory ordering applies between address
223 /// spaces.
224 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226
227 /// \returns A bit set of the address spaces accessible through address space \p AS.
228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229
230 /// \returns Info constructed from \p MI, which has at least one machine
231 /// operand.
232 std::optional<SIMemOpInfo>
233 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234
235 public:
236 /// Construct class to support accessing the machine memory operands
237 /// of instructions in the machine function \p MF.
238 SIMemOpAccess(MachineFunction &MF);
239
240 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241 std::optional<SIMemOpInfo>
242 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243
244 /// \returns Store info if \p MI is a store operation, "std::nullopt"
245 /// otherwise.
246 std::optional<SIMemOpInfo>
247 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248
249 /// \returns Atomic fence info if \p MI is an atomic fence operation,
250 /// "std::nullopt" otherwise.
251 std::optional<SIMemOpInfo>
252 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253
254 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255 /// rmw operation, "std::nullopt" otherwise.
256 std::optional<SIMemOpInfo>
257 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259
260 class SICacheControl {
261 protected:
262
263 /// AMDGPU subtarget info.
264 const GCNSubtarget &ST;
265
266 /// Instruction info.
267 const SIInstrInfo *TII = nullptr;
268
269 IsaVersion IV;
270
271 /// Whether to insert cache invalidating instructions.
272 bool InsertCacheInv;
273
274 SICacheControl(const GCNSubtarget &ST);
275
276 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
277 /// \returns True if \p MI is modified, false otherwise.
278 bool enableNamedBit(const MachineBasicBlock::iterator MI,
279 AMDGPU::CPol::CPol Bit) const;
280
281 public:
282
283 /// Create a cache control for the subtarget \p ST.
284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285
286 /// Update \p MI memory load instruction to bypass any caches up to
287 /// the \p Scope memory scope for address spaces \p
288 /// AddrSpace. Return true iff the instruction was modified.
289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290 SIAtomicScope Scope,
291 SIAtomicAddrSpace AddrSpace) const = 0;
292
293 /// Update \p MI memory store instruction to bypass any caches up to
294 /// the \p Scope memory scope for address spaces \p
295 /// AddrSpace. Return true iff the instruction was modified.
296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297 SIAtomicScope Scope,
298 SIAtomicAddrSpace AddrSpace) const = 0;
299
300 /// Update \p MI memory read-modify-write instruction to bypass any caches up
301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302 /// iff the instruction was modified.
303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304 SIAtomicScope Scope,
305 SIAtomicAddrSpace AddrSpace) const = 0;
306
307 /// Update \p MI memory instruction of kind \p Op associated with address
308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309 /// true iff the instruction was modified.
310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311 SIAtomicAddrSpace AddrSpace,
312 SIMemOp Op, bool IsVolatile,
313 bool IsNonTemporal) const = 0;
314
315 /// Inserts any necessary instructions at position \p Pos relative
316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317 /// \p Op associated with address spaces \p AddrSpace have completed. Used
318 /// between memory instructions to enforce the order they become visible as
319 /// observed by other memory instructions executing in memory scope \p Scope.
320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321 /// address spaces. Returns true iff any instructions were inserted.
322 virtual bool insertWait(MachineBasicBlock::iterator &MI,
323 SIAtomicScope Scope,
324 SIAtomicAddrSpace AddrSpace,
325 SIMemOp Op,
326 bool IsCrossAddrSpaceOrdering,
327 Position Pos) const = 0;
328
329 /// Inserts any necessary instructions at position \p Pos relative to
330 /// instruction \p MI to ensure any subsequent memory instructions of this
331 /// thread with address spaces \p AddrSpace will observe the previous memory
332 /// operations by any thread for memory scopes up to memory scope \p Scope.
333 /// Returns true iff any instructions were inserted.
334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335 SIAtomicScope Scope,
336 SIAtomicAddrSpace AddrSpace,
337 Position Pos) const = 0;
338
339 /// Inserts any necessary instructions at position \p Pos relative to
340 /// instruction \p MI to ensure previous memory instructions by this thread
341 /// with address spaces \p AddrSpace have completed and can be observed by
342 /// subsequent memory instructions by any thread executing in memory scope \p
343 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344 /// between address spaces. Returns true iff any instructions were inserted.
345 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346 SIAtomicScope Scope,
347 SIAtomicAddrSpace AddrSpace,
348 bool IsCrossAddrSpaceOrdering,
349 Position Pos) const = 0;
350
351 /// Virtual destructor to allow derivations to be deleted.
352 virtual ~SICacheControl() = default;
353
354 };
355
356 class SIGfx6CacheControl : public SICacheControl {
357 protected:
358
359 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
360 /// is modified, false otherwise.
361 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
362 return enableNamedBit(MI, AMDGPU::CPol::GLC);
363 }
364
365 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
366 /// is modified, false otherwise.
367 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
368 return enableNamedBit(MI, AMDGPU::CPol::SLC);
369 }
370
371 public:
372
373 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
374
375 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
376 SIAtomicScope Scope,
377 SIAtomicAddrSpace AddrSpace) const override;
378
379 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
380 SIAtomicScope Scope,
381 SIAtomicAddrSpace AddrSpace) const override;
382
383 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
384 SIAtomicScope Scope,
385 SIAtomicAddrSpace AddrSpace) const override;
386
387 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
388 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
389 bool IsVolatile,
390 bool IsNonTemporal) const override;
391
392 bool insertWait(MachineBasicBlock::iterator &MI,
393 SIAtomicScope Scope,
394 SIAtomicAddrSpace AddrSpace,
395 SIMemOp Op,
396 bool IsCrossAddrSpaceOrdering,
397 Position Pos) const override;
398
399 bool insertAcquire(MachineBasicBlock::iterator &MI,
400 SIAtomicScope Scope,
401 SIAtomicAddrSpace AddrSpace,
402 Position Pos) const override;
403
404 bool insertRelease(MachineBasicBlock::iterator &MI,
405 SIAtomicScope Scope,
406 SIAtomicAddrSpace AddrSpace,
407 bool IsCrossAddrSpaceOrdering,
408 Position Pos) const override;
409 };
410
411 class SIGfx7CacheControl : public SIGfx6CacheControl {
412 public:
413
414 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
415
416 bool insertAcquire(MachineBasicBlock::iterator &MI,
417 SIAtomicScope Scope,
418 SIAtomicAddrSpace AddrSpace,
419 Position Pos) const override;
420
421 };
422
423 class SIGfx90ACacheControl : public SIGfx7CacheControl {
424 public:
425
426 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
427
428 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
429 SIAtomicScope Scope,
430 SIAtomicAddrSpace AddrSpace) const override;
431
432 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
433 SIAtomicScope Scope,
434 SIAtomicAddrSpace AddrSpace) const override;
435
436 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
437 SIAtomicScope Scope,
438 SIAtomicAddrSpace AddrSpace) const override;
439
440 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
441 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
442 bool IsVolatile,
443 bool IsNonTemporal) const override;
444
445 bool insertWait(MachineBasicBlock::iterator &MI,
446 SIAtomicScope Scope,
447 SIAtomicAddrSpace AddrSpace,
448 SIMemOp Op,
449 bool IsCrossAddrSpaceOrdering,
450 Position Pos) const override;
451
452 bool insertAcquire(MachineBasicBlock::iterator &MI,
453 SIAtomicScope Scope,
454 SIAtomicAddrSpace AddrSpace,
455 Position Pos) const override;
456
457 bool insertRelease(MachineBasicBlock::iterator &MI,
458 SIAtomicScope Scope,
459 SIAtomicAddrSpace AddrSpace,
460 bool IsCrossAddrSpaceOrdering,
461 Position Pos) const override;
462 };
463
464 class SIGfx940CacheControl : public SIGfx90ACacheControl {
465 protected:
466
467 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
468 /// is modified, false otherwise.
469 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
470 return enableNamedBit(MI, AMDGPU::CPol::SC0);
471 }
472
473 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
474 /// is modified, false otherwise.
475 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
476 return enableNamedBit(MI, AMDGPU::CPol::SC1);
477 }
478
479 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
480 /// is modified, false otherwise.
481 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
482 return enableNamedBit(MI, AMDGPU::CPol::NT);
483 }
484
485 public:
486
487 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
488
489 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
490 SIAtomicScope Scope,
491 SIAtomicAddrSpace AddrSpace) const override;
492
493 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
494 SIAtomicScope Scope,
495 SIAtomicAddrSpace AddrSpace) const override;
496
497 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
498 SIAtomicScope Scope,
499 SIAtomicAddrSpace AddrSpace) const override;
500
501 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
502 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
503 bool IsVolatile,
504 bool IsNonTemporal) const override;
505
506 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
507 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
508
509 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
510 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
511 Position Pos) const override;
512 };
513
514 class SIGfx10CacheControl : public SIGfx7CacheControl {
515 protected:
516
517 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
518 /// is modified, false otherwise.
519 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
520 return enableNamedBit(MI, AMDGPU::CPol::DLC);
521 }
522
523 public:
524
525 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
526
527 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
528 SIAtomicScope Scope,
529 SIAtomicAddrSpace AddrSpace) const override;
530
531 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
532 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
533 bool IsVolatile,
534 bool IsNonTemporal) const override;
535
536 bool insertWait(MachineBasicBlock::iterator &MI,
537 SIAtomicScope Scope,
538 SIAtomicAddrSpace AddrSpace,
539 SIMemOp Op,
540 bool IsCrossAddrSpaceOrdering,
541 Position Pos) const override;
542
543 bool insertAcquire(MachineBasicBlock::iterator &MI,
544 SIAtomicScope Scope,
545 SIAtomicAddrSpace AddrSpace,
546 Position Pos) const override;
547 };
548
549 class SIGfx11CacheControl : public SIGfx10CacheControl {
550 public:
551 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
552
553 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
554 SIAtomicScope Scope,
555 SIAtomicAddrSpace AddrSpace) const override;
556
557 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
558 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
559 bool IsVolatile,
560 bool IsNonTemporal) const override;
561 };
562
563 class SIMemoryLegalizer final : public MachineFunctionPass {
564 private:
565
566 /// Cache Control.
567 std::unique_ptr<SICacheControl> CC = nullptr;
568
569 /// List of atomic pseudo instructions.
570 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
571
572 /// Return true iff instruction \p MI is an atomic instruction that
573 /// returns a result.
574 bool isAtomicRet(const MachineInstr &MI) const {
575 return SIInstrInfo::isAtomicRet(MI);
576 }
577
578 /// Removes all processed atomic pseudo instructions from the current
579 /// function. Returns true if current function is modified, false otherwise.
580 bool removeAtomicPseudoMIs();
581
582 /// Expands load operation \p MI. Returns true if instructions are
583 /// added/deleted or \p MI is modified, false otherwise.
584 bool expandLoad(const SIMemOpInfo &MOI,
585 MachineBasicBlock::iterator &MI);
586 /// Expands store operation \p MI. Returns true if instructions are
587 /// added/deleted or \p MI is modified, false otherwise.
588 bool expandStore(const SIMemOpInfo &MOI,
589 MachineBasicBlock::iterator &MI);
590 /// Expands atomic fence operation \p MI. Returns true if
591 /// instructions are added/deleted or \p MI is modified, false otherwise.
592 bool expandAtomicFence(const SIMemOpInfo &MOI,
593 MachineBasicBlock::iterator &MI);
594 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
595 /// instructions are added/deleted or \p MI is modified, false otherwise.
596 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
597 MachineBasicBlock::iterator &MI);
598
599 public:
600 static char ID;
601
602 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
603
604 void getAnalysisUsage(AnalysisUsage &AU) const override {
605 AU.setPreservesCFG();
606 MachineFunctionPass::getAnalysisUsage(AU);
607 }
608
609 StringRef getPassName() const override {
610 return PASS_NAME;
611 }
612
613 bool runOnMachineFunction(MachineFunction &MF) override;
614 };
615
616 } // end anonymous namespace
617
618 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
619 const char *Msg) const {
620 const Function &Func = MI->getParent()->getParent()->getFunction();
621 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
622 Func.getContext().diagnose(Diag);
623 }
624
625 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
626 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
627 SIAtomicAddrSpace InstrAddrSpace) const {
628 if (SSID == SyncScope::System)
629 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
630 if (SSID == MMI->getAgentSSID())
631 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
632 if (SSID == MMI->getWorkgroupSSID())
633 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
634 true);
635 if (SSID == MMI->getWavefrontSSID())
636 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
637 true);
638 if (SSID == SyncScope::SingleThread)
639 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
640 true);
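// The "one address space" variants order only the address spaces actually
// accessed by the instruction and therefore never require cross address
// space ordering.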
641 if (SSID == MMI->getSystemOneAddressSpaceSSID())
642 return std::tuple(SIAtomicScope::SYSTEM,
643 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
644 if (SSID == MMI->getAgentOneAddressSpaceSSID())
645 return std::tuple(SIAtomicScope::AGENT,
646 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
647 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
648 return std::tuple(SIAtomicScope::WORKGROUP,
649 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
650 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
651 return std::tuple(SIAtomicScope::WAVEFRONT,
652 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
653 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
654 return std::tuple(SIAtomicScope::SINGLETHREAD,
655 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
656 return std::nullopt;
657 }
658
659 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
660 if (AS == AMDGPUAS::FLAT_ADDRESS)
661 return SIAtomicAddrSpace::FLAT;
662 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
663 return SIAtomicAddrSpace::GLOBAL;
664 if (AS == AMDGPUAS::LOCAL_ADDRESS)
665 return SIAtomicAddrSpace::LDS;
666 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
667 return SIAtomicAddrSpace::SCRATCH;
668 if (AS == AMDGPUAS::REGION_ADDRESS)
669 return SIAtomicAddrSpace::GDS;
670
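// Any other address space (e.g. constant) is conservatively classified as
// OTHER.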
671 return SIAtomicAddrSpace::OTHER;
672 }
673
674 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
675 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
676 }
677
678 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
679 const MachineBasicBlock::iterator &MI) const {
680 assert(MI->getNumMemOperands() > 0);
681
682 SyncScope::ID SSID = SyncScope::SingleThread;
683 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
684 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
685 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
686 bool IsNonTemporal = true;
687 bool IsVolatile = false;
688
689 // Validator should check whether or not MMOs cover the entire set of
690 // locations accessed by the memory instruction.
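// Merge conservatively across all memory operands: the access is nontemporal
// only if every operand is nontemporal, and volatile if any operand is
// volatile.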
691 for (const auto &MMO : MI->memoperands()) {
692 IsNonTemporal &= MMO->isNonTemporal();
693 IsVolatile |= MMO->isVolatile();
694 InstrAddrSpace |=
695 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
696 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
697 if (OpOrdering != AtomicOrdering::NotAtomic) {
698 const auto &IsSyncScopeInclusion =
699 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
700 if (!IsSyncScopeInclusion) {
701 reportUnsupported(MI,
702 "Unsupported non-inclusive atomic synchronization scope");
703 return std::nullopt;
704 }
705
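// Keep the wider of the two synchronization scopes.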
706 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
707 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
708 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
709 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
710 FailureOrdering =
711 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
712 }
713 }
714
715 SIAtomicScope Scope = SIAtomicScope::NONE;
716 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
717 bool IsCrossAddressSpaceOrdering = false;
718 if (Ordering != AtomicOrdering::NotAtomic) {
719 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
720 if (!ScopeOrNone) {
721 reportUnsupported(MI, "Unsupported atomic synchronization scope");
722 return std::nullopt;
723 }
724 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
725 *ScopeOrNone;
726 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
727 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
728 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
729 reportUnsupported(MI, "Unsupported atomic address space");
730 return std::nullopt;
731 }
732 }
733 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
734 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
735 IsNonTemporal);
736 }
737
738 std::optional<SIMemOpInfo>
739 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
740 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
741
742 if (!(MI->mayLoad() && !MI->mayStore()))
743 return std::nullopt;
744
745 // Be conservative if there are no memory operands.
746 if (MI->getNumMemOperands() == 0)
747 return SIMemOpInfo();
748
749 return constructFromMIWithMMO(MI);
750 }
751
752 std::optional<SIMemOpInfo>
753 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
754 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
755
756 if (!(!MI->mayLoad() && MI->mayStore()))
757 return std::nullopt;
758
759 // Be conservative if there are no memory operands.
760 if (MI->getNumMemOperands() == 0)
761 return SIMemOpInfo();
762
763 return constructFromMIWithMMO(MI);
764 }
765
766 std::optional<SIMemOpInfo>
767 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
768 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
769
770 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
771 return std::nullopt;
772
773 AtomicOrdering Ordering =
774 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
775
776 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
777 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
778 if (!ScopeOrNone) {
779 reportUnsupported(MI, "Unsupported atomic synchronization scope");
780 return std::nullopt;
781 }
782
783 SIAtomicScope Scope = SIAtomicScope::NONE;
784 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
785 bool IsCrossAddressSpaceOrdering = false;
786 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
787 *ScopeOrNone;
788
789 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
790 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
791 reportUnsupported(MI, "Unsupported atomic address space");
792 return std::nullopt;
793 }
794
795 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
796 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
797 }
798
799 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
800 const MachineBasicBlock::iterator &MI) const {
801 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
802
803 if (!(MI->mayLoad() && MI->mayStore()))
804 return std::nullopt;
805
806 // Be conservative if there are no memory operands.
807 if (MI->getNumMemOperands() == 0)
808 return SIMemOpInfo();
809
810 return constructFromMIWithMMO(MI);
811 }
812
813 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
814 TII = ST.getInstrInfo();
815 IV = getIsaVersion(ST.getCPU());
816 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
817 }
818
819 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
820 AMDGPU::CPol::CPol Bit) const {
821 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
822 if (!CPol)
823 return false;
824
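// OR the requested cache-policy bit into the existing cpol immediate operand.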
825 CPol->setImm(CPol->getImm() | Bit);
826 return true;
827 }
828
829 /* static */
830 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
831 GCNSubtarget::Generation Generation = ST.getGeneration();
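// Check the GFX940/GFX90A features before the generation comparisons below,
// since those targets would otherwise fall into the pre-GFX10 (GFX7) case.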
832 if (ST.hasGFX940Insts())
833 return std::make_unique<SIGfx940CacheControl>(ST);
834 if (ST.hasGFX90AInsts())
835 return std::make_unique<SIGfx90ACacheControl>(ST);
836 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
837 return std::make_unique<SIGfx6CacheControl>(ST);
838 if (Generation < AMDGPUSubtarget::GFX10)
839 return std::make_unique<SIGfx7CacheControl>(ST);
840 if (Generation < AMDGPUSubtarget::GFX11)
841 return std::make_unique<SIGfx10CacheControl>(ST);
842 return std::make_unique<SIGfx11CacheControl>(ST);
843 }
844
845 bool SIGfx6CacheControl::enableLoadCacheBypass(
846 const MachineBasicBlock::iterator &MI,
847 SIAtomicScope Scope,
848 SIAtomicAddrSpace AddrSpace) const {
849 assert(MI->mayLoad() && !MI->mayStore());
850 bool Changed = false;
851
852 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
853 switch (Scope) {
854 case SIAtomicScope::SYSTEM:
855 case SIAtomicScope::AGENT:
856 // Set L1 cache policy to MISS_EVICT.
857 // Note: there is no L2 cache bypass policy at the ISA level.
858 Changed |= enableGLCBit(MI);
859 break;
860 case SIAtomicScope::WORKGROUP:
861 case SIAtomicScope::WAVEFRONT:
862 case SIAtomicScope::SINGLETHREAD:
863 // No cache to bypass.
864 break;
865 default:
866 llvm_unreachable("Unsupported synchronization scope");
867 }
868 }
869
870 /// The scratch address space does not need the global memory caches
871 /// to be bypassed as all memory operations by the same thread are
872 /// sequentially consistent, and no other thread can access scratch
873 /// memory.
874
875 /// Other address spaces do not have a cache.
876
877 return Changed;
878 }
879
880 bool SIGfx6CacheControl::enableStoreCacheBypass(
881 const MachineBasicBlock::iterator &MI,
882 SIAtomicScope Scope,
883 SIAtomicAddrSpace AddrSpace) const {
884 assert(!MI->mayLoad() && MI->mayStore());
885 bool Changed = false;
886
887 /// The L1 cache is write through, so it does not need to be bypassed. There is
888 /// no bypass control for the L2 cache at the ISA level.
889
890 return Changed;
891 }
892
893 bool SIGfx6CacheControl::enableRMWCacheBypass(
894 const MachineBasicBlock::iterator &MI,
895 SIAtomicScope Scope,
896 SIAtomicAddrSpace AddrSpace) const {
897 assert(MI->mayLoad() && MI->mayStore());
898 bool Changed = false;
899
900 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
901 /// bypassed, and the GLC bit is instead used to indicate if they are
902 /// return or no-return.
903 /// Note: there is no L2 cache coherent bypass control at the ISA level.
904
905 return Changed;
906 }
907
908 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
909 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
910 bool IsVolatile, bool IsNonTemporal) const {
911 // Only handle load and store, not atomic read-modify-write instructions. The
912 // latter use glc to indicate if the atomic returns a result and so must not
913 // be used for cache control.
914 assert(MI->mayLoad() ^ MI->mayStore());
915
916 // Only update load and store, not LLVM IR atomic read-modify-write
917 // instructions. The latter are always marked as volatile, so handling them
918 // here would pessimize all atomics. Also they do not support
919 // the nontemporal attribute.
920 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
921
922 bool Changed = false;
923
924 if (IsVolatile) {
925 // Set L1 cache policy to be MISS_EVICT for load instructions
926 // and MISS_LRU for store instructions.
927 // Note: there is no L2 cache bypass policy at the ISA level.
928 if (Op == SIMemOp::LOAD)
929 Changed |= enableGLCBit(MI);
930
931 // Ensure operation has completed at system scope to cause all volatile
932 // operations to be visible outside the program in a global order. Do not
933 // request cross address space as only the global address space can be
934 // observable outside the program, so no need to cause a waitcnt for LDS
935 // address space operations.
936 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
937 Position::AFTER);
938
939 return Changed;
940 }
941
942 if (IsNonTemporal) {
943 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
944 // for both loads and stores, and the L2 cache policy to STREAM.
945 Changed |= enableGLCBit(MI);
946 Changed |= enableSLCBit(MI);
947 return Changed;
948 }
949
950 return Changed;
951 }
952
953 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
954 SIAtomicScope Scope,
955 SIAtomicAddrSpace AddrSpace,
956 SIMemOp Op,
957 bool IsCrossAddrSpaceOrdering,
958 Position Pos) const {
959 bool Changed = false;
960
961 MachineBasicBlock &MBB = *MI->getParent();
962 DebugLoc DL = MI->getDebugLoc();
963
964 if (Pos == Position::AFTER)
965 ++MI;
966
967 bool VMCnt = false;
968 bool LGKMCnt = false;
969
970 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
971 SIAtomicAddrSpace::NONE) {
972 switch (Scope) {
973 case SIAtomicScope::SYSTEM:
974 case SIAtomicScope::AGENT:
975 VMCnt |= true;
976 break;
977 case SIAtomicScope::WORKGROUP:
978 case SIAtomicScope::WAVEFRONT:
979 case SIAtomicScope::SINGLETHREAD:
980 // The L1 cache keeps all memory operations in order for
981 // wavefronts in the same work-group.
982 break;
983 default:
984 llvm_unreachable("Unsupported synchronization scope");
985 }
986 }
987
988 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
989 switch (Scope) {
990 case SIAtomicScope::SYSTEM:
991 case SIAtomicScope::AGENT:
992 case SIAtomicScope::WORKGROUP:
993 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
994 // not needed as LDS operations for all waves are executed in a total
995 // global ordering as observed by all waves. Required if also
996 // synchronizing with global/GDS memory as LDS operations could be
997 // reordered with respect to later global/GDS memory operations of the
998 // same wave.
999 LGKMCnt |= IsCrossAddrSpaceOrdering;
1000 break;
1001 case SIAtomicScope::WAVEFRONT:
1002 case SIAtomicScope::SINGLETHREAD:
1003 // The LDS keeps all memory operations in order for
1004 // the same wavefront.
1005 break;
1006 default:
1007 llvm_unreachable("Unsupported synchronization scope");
1008 }
1009 }
1010
1011 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1012 switch (Scope) {
1013 case SIAtomicScope::SYSTEM:
1014 case SIAtomicScope::AGENT:
1015 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1016 // is not needed as GDS operations for all waves are executed in a total
1017 // global ordering as observed by all waves. Required if also
1018 // synchronizing with global/LDS memory as GDS operations could be
1019 // reordered with respect to later global/LDS memory operations of the
1020 // same wave.
1021 LGKMCnt |= IsCrossAddrSpaceOrdering;
1022 break;
1023 case SIAtomicScope::WORKGROUP:
1024 case SIAtomicScope::WAVEFRONT:
1025 case SIAtomicScope::SINGLETHREAD:
1026 // The GDS keeps all memory operations in order for
1027 // the same work-group.
1028 break;
1029 default:
1030 llvm_unreachable("Unsupported synchronization scope");
1031 }
1032 }
1033
1034 if (VMCnt || LGKMCnt) {
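// A zero count waits for all outstanding operations of that kind to complete;
// the maximum bit-mask value leaves that counter unconstrained. The expcnt
// counter is never waited on here.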
1035 unsigned WaitCntImmediate =
1036 AMDGPU::encodeWaitcnt(IV,
1037 VMCnt ? 0 : getVmcntBitMask(IV),
1038 getExpcntBitMask(IV),
1039 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1040 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1041 Changed = true;
1042 }
1043
1044 if (Pos == Position::AFTER)
1045 --MI;
1046
1047 return Changed;
1048 }
1049
1050 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1051 SIAtomicScope Scope,
1052 SIAtomicAddrSpace AddrSpace,
1053 Position Pos) const {
1054 if (!InsertCacheInv)
1055 return false;
1056
1057 bool Changed = false;
1058
1059 MachineBasicBlock &MBB = *MI->getParent();
1060 DebugLoc DL = MI->getDebugLoc();
1061
1062 if (Pos == Position::AFTER)
1063 ++MI;
1064
1065 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1066 switch (Scope) {
1067 case SIAtomicScope::SYSTEM:
1068 case SIAtomicScope::AGENT:
1069 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1070 Changed = true;
1071 break;
1072 case SIAtomicScope::WORKGROUP:
1073 case SIAtomicScope::WAVEFRONT:
1074 case SIAtomicScope::SINGLETHREAD:
1075 // No cache to invalidate.
1076 break;
1077 default:
1078 llvm_unreachable("Unsupported synchronization scope");
1079 }
1080 }
1081
1082 /// The scratch address space does not need the global memory cache
1083 /// to be flushed as all memory operations by the same thread are
1084 /// sequentially consistent, and no other thread can access scratch
1085 /// memory.
1086
1087 /// Other address spaces do not have a cache.
1088
1089 if (Pos == Position::AFTER)
1090 --MI;
1091
1092 return Changed;
1093 }
1094
1095 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1096 SIAtomicScope Scope,
1097 SIAtomicAddrSpace AddrSpace,
1098 bool IsCrossAddrSpaceOrdering,
1099 Position Pos) const {
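// On this generation a release only needs to wait for prior memory operations
// to complete; no cache writeback instruction is required.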
1100 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1101 IsCrossAddrSpaceOrdering, Pos);
1102 }
1103
1104 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1105 SIAtomicScope Scope,
1106 SIAtomicAddrSpace AddrSpace,
1107 Position Pos) const {
1108 if (!InsertCacheInv)
1109 return false;
1110
1111 bool Changed = false;
1112
1113 MachineBasicBlock &MBB = *MI->getParent();
1114 DebugLoc DL = MI->getDebugLoc();
1115
1116 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1117
1118 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1119 ? AMDGPU::BUFFER_WBINVL1
1120 : AMDGPU::BUFFER_WBINVL1_VOL;
1121
1122 if (Pos == Position::AFTER)
1123 ++MI;
1124
1125 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1126 switch (Scope) {
1127 case SIAtomicScope::SYSTEM:
1128 case SIAtomicScope::AGENT:
1129 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1130 Changed = true;
1131 break;
1132 case SIAtomicScope::WORKGROUP:
1133 case SIAtomicScope::WAVEFRONT:
1134 case SIAtomicScope::SINGLETHREAD:
1135 // No cache to invalidate.
1136 break;
1137 default:
1138 llvm_unreachable("Unsupported synchronization scope");
1139 }
1140 }
1141
1142 /// The scratch address space does not need the global memory cache
1143 /// to be flushed as all memory operations by the same thread are
1144 /// sequentially consistent, and no other thread can access scratch
1145 /// memory.
1146
1147 /// Other address spaces do not have a cache.
1148
1149 if (Pos == Position::AFTER)
1150 --MI;
1151
1152 return Changed;
1153 }
1154
1155 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1156 const MachineBasicBlock::iterator &MI,
1157 SIAtomicScope Scope,
1158 SIAtomicAddrSpace AddrSpace) const {
1159 assert(MI->mayLoad() && !MI->mayStore());
1160 bool Changed = false;
1161
1162 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1163 switch (Scope) {
1164 case SIAtomicScope::SYSTEM:
1165 case SIAtomicScope::AGENT:
1166 // Set the L1 cache policy to MISS_LRU.
1167 // Note: there is no L2 cache bypass policy at the ISA level.
1168 Changed |= enableGLCBit(MI);
1169 break;
1170 case SIAtomicScope::WORKGROUP:
1171 // In threadgroup split mode the waves of a work-group can be executing on
1172 // different CUs. Therefore need to bypass the L1 which is per CU.
1173 // Otherwise in non-threadgroup split mode all waves of a work-group are
1174 // on the same CU, and so the L1 does not need to be bypassed.
1175 if (ST.isTgSplitEnabled())
1176 Changed |= enableGLCBit(MI);
1177 break;
1178 case SIAtomicScope::WAVEFRONT:
1179 case SIAtomicScope::SINGLETHREAD:
1180 // No cache to bypass.
1181 break;
1182 default:
1183 llvm_unreachable("Unsupported synchronization scope");
1184 }
1185 }
1186
1187 /// The scratch address space does not need the global memory caches
1188 /// to be bypassed as all memory operations by the same thread are
1189 /// sequentially consistent, and no other thread can access scratch
1190 /// memory.
1191
1192 /// Other address spaces do not have a cache.
1193
1194 return Changed;
1195 }
1196
1197 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1198 const MachineBasicBlock::iterator &MI,
1199 SIAtomicScope Scope,
1200 SIAtomicAddrSpace AddrSpace) const {
1201 assert(!MI->mayLoad() && MI->mayStore());
1202 bool Changed = false;
1203
1204 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1205 switch (Scope) {
1206 case SIAtomicScope::SYSTEM:
1207 case SIAtomicScope::AGENT:
1208 /// Do not set glc for store atomic operations as they implicitly write
1209 /// through the L1 cache.
1210 break;
1211 case SIAtomicScope::WORKGROUP:
1212 case SIAtomicScope::WAVEFRONT:
1213 case SIAtomicScope::SINGLETHREAD:
1214 // No cache to bypass. Store atomics implicitly write through the L1
1215 // cache.
1216 break;
1217 default:
1218 llvm_unreachable("Unsupported synchronization scope");
1219 }
1220 }
1221
1222 /// The scratch address space does not need the global memory caches
1223 /// to be bypassed as all memory operations by the same thread are
1224 /// sequentially consistent, and no other thread can access scratch
1225 /// memory.
1226
1227 /// Other address spaces do not have a cache.
1228
1229 return Changed;
1230 }
1231
1232 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1233 const MachineBasicBlock::iterator &MI,
1234 SIAtomicScope Scope,
1235 SIAtomicAddrSpace AddrSpace) const {
1236 assert(MI->mayLoad() && MI->mayStore());
1237 bool Changed = false;
1238
1239 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1240 switch (Scope) {
1241 case SIAtomicScope::SYSTEM:
1242 case SIAtomicScope::AGENT:
1243 /// Do not set glc for RMW atomic operations as they implicitly bypass
1244 /// the L1 cache, and the glc bit is instead used to indicate if they are
1245 /// return or no-return.
1246 break;
1247 case SIAtomicScope::WORKGROUP:
1248 case SIAtomicScope::WAVEFRONT:
1249 case SIAtomicScope::SINGLETHREAD:
1250 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1251 break;
1252 default:
1253 llvm_unreachable("Unsupported synchronization scope");
1254 }
1255 }
1256
1257 return Changed;
1258 }
1259
1260 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1261 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1262 bool IsVolatile, bool IsNonTemporal) const {
1263 // Only handle load and store, not atomic read-modify-write instructions. The
1264 // latter use glc to indicate if the atomic returns a result and so must not
1265 // be used for cache control.
1266 assert(MI->mayLoad() ^ MI->mayStore());
1267
1268 // Only update load and store, not LLVM IR atomic read-modify-write
1269 // instructions. The latter are always marked as volatile, so handling them
1270 // here would pessimize all atomics. Also they do not support
1271 // the nontemporal attribute.
1272 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1273
1274 bool Changed = false;
1275
1276 if (IsVolatile) {
1277 // Set L1 cache policy to be MISS_EVICT for load instructions
1278 // and MISS_LRU for store instructions.
1279 // Note: there is no L2 cache bypass policy at the ISA level.
1280 if (Op == SIMemOp::LOAD)
1281 Changed |= enableGLCBit(MI);
1282
1283 // Ensure operation has completed at system scope to cause all volatile
1284 // operations to be visible outside the program in a global order. Do not
1285 // request cross address space as only the global address space can be
1286 // observable outside the program, so no need to cause a waitcnt for LDS
1287 // address space operations.
1288 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1289 Position::AFTER);
1290
1291 return Changed;
1292 }
1293
1294 if (IsNonTemporal) {
1295 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1296 // for both loads and stores, and the L2 cache policy to STREAM.
1297 Changed |= enableGLCBit(MI);
1298 Changed |= enableSLCBit(MI);
1299 return Changed;
1300 }
1301
1302 return Changed;
1303 }
1304
1305 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1306 SIAtomicScope Scope,
1307 SIAtomicAddrSpace AddrSpace,
1308 SIMemOp Op,
1309 bool IsCrossAddrSpaceOrdering,
1310 Position Pos) const {
1311 if (ST.isTgSplitEnabled()) {
1312 // In threadgroup split mode the waves of a work-group can be executing on
1313 // different CUs. Therefore need to wait for global or GDS memory operations
1314 // to complete to ensure they are visible to waves in the other CUs.
1315 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1316 // the same CU, so no need to wait for global memory as all waves in the
1317 // work-group access the same L1, nor wait for GDS as accesses are ordered
1318 // on a CU.
1319 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1320 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1321 (Scope == SIAtomicScope::WORKGROUP)) {
1322 // Same as GFX7 using agent scope.
1323 Scope = SIAtomicScope::AGENT;
1324 }
1325 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1326 // LDS memory operations.
1327 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1328 }
1329 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1330 IsCrossAddrSpaceOrdering, Pos);
1331 }
1332
1333 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1334 SIAtomicScope Scope,
1335 SIAtomicAddrSpace AddrSpace,
1336 Position Pos) const {
1337 if (!InsertCacheInv)
1338 return false;
1339
1340 bool Changed = false;
1341
1342 MachineBasicBlock &MBB = *MI->getParent();
1343 DebugLoc DL = MI->getDebugLoc();
1344
1345 if (Pos == Position::AFTER)
1346 ++MI;
1347
1348 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1349 switch (Scope) {
1350 case SIAtomicScope::SYSTEM:
1351 // Ensures that following loads will not see stale remote VMEM data or
1352 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1353 // CC will never be stale due to the local memory probes.
1354 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1355 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1356 // hardware does not reorder memory operations by the same wave with
1357 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1358 // remove any cache lines of earlier writes by the same wave and ensures
1359 // later reads by the same wave will refetch the cache lines.
1360 Changed = true;
1361 break;
1362 case SIAtomicScope::AGENT:
1363 // Same as GFX7.
1364 break;
1365 case SIAtomicScope::WORKGROUP:
1366 // In threadgroup split mode the waves of a work-group can be executing on
1367 // different CUs. Therefore need to invalidate the L1 which is per CU.
1368 // Otherwise in non-threadgroup split mode all waves of a work-group are
1369 // on the same CU, and so the L1 does not need to be invalidated.
1370 if (ST.isTgSplitEnabled()) {
1371 // Same as GFX7 using agent scope.
1372 Scope = SIAtomicScope::AGENT;
1373 }
1374 break;
1375 case SIAtomicScope::WAVEFRONT:
1376 case SIAtomicScope::SINGLETHREAD:
1377 // Same as GFX7.
1378 break;
1379 default:
1380 llvm_unreachable("Unsupported synchronization scope");
1381 }
1382 }
1383
1384 /// The scratch address space does not need the global memory cache
1385 /// to be flushed as all memory operations by the same thread are
1386 /// sequentially consistent, and no other thread can access scratch
1387 /// memory.
1388
1389 /// Other address spaces do not have a cache.
1390
1391 if (Pos == Position::AFTER)
1392 --MI;
1393
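// Also perform the GFX7 acquire sequence to invalidate the vector L1 cache
// for the (possibly widened) scope.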
1394 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1395
1396 return Changed;
1397 }
1398
1399 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1400 SIAtomicScope Scope,
1401 SIAtomicAddrSpace AddrSpace,
1402 bool IsCrossAddrSpaceOrdering,
1403 Position Pos) const {
1404 bool Changed = false;
1405
1406 MachineBasicBlock &MBB = *MI->getParent();
1407 DebugLoc DL = MI->getDebugLoc();
1408
1409 if (Pos == Position::AFTER)
1410 ++MI;
1411
1412 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1413 switch (Scope) {
1414 case SIAtomicScope::SYSTEM:
1415 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1416 // hardware does not reorder memory operations by the same wave with
1417 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1418 // to initiate writeback of any dirty cache lines of earlier writes by the
1419 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1420 // writeback has completed.
1421 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1422 // Set SC bits to indicate system scope.
1423 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1424 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1425 // vmcnt(0)" needed by the "BUFFER_WBL2".
1426 Changed = true;
1427 break;
1428 case SIAtomicScope::AGENT:
1429 case SIAtomicScope::WORKGROUP:
1430 case SIAtomicScope::WAVEFRONT:
1431 case SIAtomicScope::SINGLETHREAD:
1432 // Same as GFX7.
1433 break;
1434 default:
1435 llvm_unreachable("Unsupported synchronization scope");
1436 }
1437 }
1438
1439 if (Pos == Position::AFTER)
1440 --MI;
1441
1442 Changed |=
1443 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1444 IsCrossAddrSpaceOrdering, Pos);
1445
1446 return Changed;
1447 }
1448
1449 bool SIGfx940CacheControl::enableLoadCacheBypass(
1450 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1451 SIAtomicAddrSpace AddrSpace) const {
1452 assert(MI->mayLoad() && !MI->mayStore());
1453 bool Changed = false;
1454
1455 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1456 switch (Scope) {
1457 case SIAtomicScope::SYSTEM:
1458 // Set SC bits to indicate system scope.
1459 Changed |= enableSC0Bit(MI);
1460 Changed |= enableSC1Bit(MI);
1461 break;
1462 case SIAtomicScope::AGENT:
1463 // Set SC bits to indicate agent scope.
1464 Changed |= enableSC1Bit(MI);
1465 break;
1466 case SIAtomicScope::WORKGROUP:
1467 // In threadgroup split mode the waves of a work-group can be executing on
1468 // different CUs. Therefore need to bypass the L1 which is per CU.
1469 // Otherwise in non-threadgroup split mode all waves of a work-group are
1470 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1471 // bits to indicate work-group scope will do this automatically.
1472 Changed |= enableSC0Bit(MI);
1473 break;
1474 case SIAtomicScope::WAVEFRONT:
1475 case SIAtomicScope::SINGLETHREAD:
1476 // Leave SC bits unset to indicate wavefront scope.
1477 break;
1478 default:
1479 llvm_unreachable("Unsupported synchronization scope");
1480 }
1481 }
1482
1483 /// The scratch address space does not need the global memory caches
1484 /// to be bypassed as all memory operations by the same thread are
1485 /// sequentially consistent, and no other thread can access scratch
1486 /// memory.
1487
1488 /// Other address spaces do not have a cache.
1489
1490 return Changed;
1491 }
1492
1493 bool SIGfx940CacheControl::enableStoreCacheBypass(
1494 const MachineBasicBlock::iterator &MI,
1495 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1496 assert(!MI->mayLoad() && MI->mayStore());
1497 bool Changed = false;
1498
1499 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1500 switch (Scope) {
1501 case SIAtomicScope::SYSTEM:
1502 // Set SC bits to indicate system scope.
1503 Changed |= enableSC0Bit(MI);
1504 Changed |= enableSC1Bit(MI);
1505 break;
1506 case SIAtomicScope::AGENT:
1507 // Set SC bits to indicate agent scope.
1508 Changed |= enableSC1Bit(MI);
1509 break;
1510 case SIAtomicScope::WORKGROUP:
1511 // Set SC bits to indicate workgroup scope.
1512 Changed |= enableSC0Bit(MI);
1513 break;
1514 case SIAtomicScope::WAVEFRONT:
1515 case SIAtomicScope::SINGLETHREAD:
1516 // Leave SC bits unset to indicate wavefront scope.
1517 break;
1518 default:
1519 llvm_unreachable("Unsupported synchronization scope");
1520 }
1521 }
1522
1523 /// The scratch address space does not need the global memory caches
1524 /// to be bypassed as all memory operations by the same thread are
1525 /// sequentially consistent, and no other thread can access scratch
1526 /// memory.
1527
1528 /// Other address spaces do not have a cache.
1529
1530 return Changed;
1531 }
1532
1533 bool SIGfx940CacheControl::enableRMWCacheBypass(
1534 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1535 SIAtomicAddrSpace AddrSpace) const {
1536 assert(MI->mayLoad() && MI->mayStore());
1537 bool Changed = false;
1538
1539 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1540 switch (Scope) {
1541 case SIAtomicScope::SYSTEM:
1542 // Set SC1 bit to indicate system scope.
1543 Changed |= enableSC1Bit(MI);
1544 break;
1545 case SIAtomicScope::AGENT:
1546 case SIAtomicScope::WORKGROUP:
1547 case SIAtomicScope::WAVEFRONT:
1548 case SIAtomicScope::SINGLETHREAD:
1549 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1550 // to indicate system or agent scope. The SC0 bit is used to indicate if
1551 // they are return or no-return. Leave SC1 bit unset to indicate agent
1552 // scope.
1553 break;
1554 default:
1555 llvm_unreachable("Unsupported synchronization scope");
1556 }
1557 }
1558
1559 return Changed;
1560 }
1561
1562 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1563 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1564 bool IsVolatile, bool IsNonTemporal) const {
1565   // Only handle load and store, not atomic read-modify-write instructions. The
1566   // latter use glc to indicate if the atomic returns a result, so glc must not
1567   // be used for cache control.
1568 assert(MI->mayLoad() ^ MI->mayStore());
1569
1570   // Only update load and store, not LLVM IR atomic read-modify-write
1571   // instructions. The latter are always marked as volatile, so they cannot
1572   // sensibly be handled here without pessimizing all atomics. They also do not
1573   // support the nontemporal attribute.
1574 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1575
1576 bool Changed = false;
1577
1578 if (IsVolatile) {
1579 // Set SC bits to indicate system scope.
1580 Changed |= enableSC0Bit(MI);
1581 Changed |= enableSC1Bit(MI);
1582
1583 // Ensure operation has completed at system scope to cause all volatile
1584 // operations to be visible outside the program in a global order. Do not
1585 // request cross address space as only the global address space can be
1586 // observable outside the program, so no need to cause a waitcnt for LDS
1587 // address space operations.
1588 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1589 Position::AFTER);
1590
1591 return Changed;
1592 }
1593
1594 if (IsNonTemporal) {
1595 Changed |= enableNTBit(MI);
1596 return Changed;
1597 }
1598
1599 return Changed;
1600 }
1601
1602 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1603 SIAtomicScope Scope,
1604 SIAtomicAddrSpace AddrSpace,
1605 Position Pos) const {
1606 if (!InsertCacheInv)
1607 return false;
1608
1609 bool Changed = false;
1610
1611 MachineBasicBlock &MBB = *MI->getParent();
1612 DebugLoc DL = MI->getDebugLoc();
1613
1614 if (Pos == Position::AFTER)
1615 ++MI;
1616
1617 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1618 switch (Scope) {
1619 case SIAtomicScope::SYSTEM:
1620 // Ensures that following loads will not see stale remote VMEM data or
1621 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1622 // CC will never be stale due to the local memory probes.
1623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1624 // Set SC bits to indicate system scope.
1625 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1626 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1627 // hardware does not reorder memory operations by the same wave with
1628 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1629 // remove any cache lines of earlier writes by the same wave and ensures
1630 // later reads by the same wave will refetch the cache lines.
1631 Changed = true;
1632 break;
1633 case SIAtomicScope::AGENT:
1634       // Ensures that following loads will not see stale remote data or local
1635 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1636 // due to the memory probes.
1637 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1638 // Set SC bits to indicate agent scope.
1639 .addImm(AMDGPU::CPol::SC1);
1640 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1641       // does not reorder memory operations with respect to a preceding buffer
1642       // invalidate. The invalidate is guaranteed to remove any cache lines of
1643       // earlier writes and ensures later reads will refetch the cache lines.
1644 Changed = true;
1645 break;
1646 case SIAtomicScope::WORKGROUP:
1647 // In threadgroup split mode the waves of a work-group can be executing on
1648 // different CUs. Therefore need to invalidate the L1 which is per CU.
1649 // Otherwise in non-threadgroup split mode all waves of a work-group are
1650 // on the same CU, and so the L1 does not need to be invalidated.
1651 if (ST.isTgSplitEnabled()) {
1652         // Ensures L1 is invalidated if in threadgroup split mode. In
1653         // non-threadgroup split mode it is a NOP, but there is no point generating
1654         // it when we know we are not in that mode.
1655 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1656 // Set SC bits to indicate work-group scope.
1657 .addImm(AMDGPU::CPol::SC0);
1658 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1659         // does not reorder memory operations with respect to a preceding buffer
1660         // invalidate. The invalidate is guaranteed to remove any cache lines of
1661         // earlier writes and ensures later reads will refetch the cache lines.
1662 Changed = true;
1663 }
1664 break;
1665 case SIAtomicScope::WAVEFRONT:
1666 case SIAtomicScope::SINGLETHREAD:
1667 // Could generate "BUFFER_INV" but it would do nothing as there are no
1668 // caches to invalidate.
1669 break;
1670 default:
1671 llvm_unreachable("Unsupported synchronization scope");
1672 }
1673 }
1674
1675 /// The scratch address space does not need the global memory cache
1676 /// to be flushed as all memory operations by the same thread are
1677 /// sequentially consistent, and no other thread can access scratch
1678 /// memory.
1679
1680 /// Other address spaces do not have a cache.
1681
1682 if (Pos == Position::AFTER)
1683 --MI;
1684
1685 return Changed;
1686 }
1687
1688 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1689 SIAtomicScope Scope,
1690 SIAtomicAddrSpace AddrSpace,
1691 bool IsCrossAddrSpaceOrdering,
1692 Position Pos) const {
1693 bool Changed = false;
1694
1695 MachineBasicBlock &MBB = *MI->getParent();
1696 DebugLoc DL = MI->getDebugLoc();
1697
1698 if (Pos == Position::AFTER)
1699 ++MI;
1700
1701 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1702 switch (Scope) {
1703 case SIAtomicScope::SYSTEM:
1704 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1705 // hardware does not reorder memory operations by the same wave with
1706 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1707 // to initiate writeback of any dirty cache lines of earlier writes by the
1708 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1709 // writeback has completed.
1710 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1711 // Set SC bits to indicate system scope.
1712 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1713 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1714 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1715 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1716 Changed = true;
1717 break;
1718 case SIAtomicScope::AGENT:
1719 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1720 // Set SC bits to indicate agent scope.
1721 .addImm(AMDGPU::CPol::SC1);
1722
1723 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1724 // SIAtomicScope::AGENT, the following insertWait will generate the
1725 // required "S_WAITCNT vmcnt(0)".
1726 Changed = true;
1727 break;
1728 case SIAtomicScope::WORKGROUP:
1729 case SIAtomicScope::WAVEFRONT:
1730 case SIAtomicScope::SINGLETHREAD:
1731 // Do not generate "BUFFER_WBL2" as there are no caches it would
1732 // writeback, and would require an otherwise unnecessary
1733 // "S_WAITCNT vmcnt(0)".
1734 break;
1735 default:
1736 llvm_unreachable("Unsupported synchronization scope");
1737 }
1738 }
1739
1740 if (Pos == Position::AFTER)
1741 --MI;
1742
1743   // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" emitted above,
1744   // as well as any other waits this release requires.
1745 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1746 IsCrossAddrSpaceOrdering, Pos);
1747
1748 return Changed;
1749 }
1750
1751 bool SIGfx10CacheControl::enableLoadCacheBypass(
1752 const MachineBasicBlock::iterator &MI,
1753 SIAtomicScope Scope,
1754 SIAtomicAddrSpace AddrSpace) const {
1755 assert(MI->mayLoad() && !MI->mayStore());
1756 bool Changed = false;
1757
1758 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1759 switch (Scope) {
1760 case SIAtomicScope::SYSTEM:
1761 case SIAtomicScope::AGENT:
1762 // Set the L0 and L1 cache policies to MISS_EVICT.
1763 // Note: there is no L2 cache coherent bypass control at the ISA level.
1764 Changed |= enableGLCBit(MI);
1765 Changed |= enableDLCBit(MI);
1766 break;
1767 case SIAtomicScope::WORKGROUP:
1768 // In WGP mode the waves of a work-group can be executing on either CU of
1769 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1770 // CU mode all waves of a work-group are on the same CU, and so the L0
1771 // does not need to be bypassed.
1772 if (!ST.isCuModeEnabled())
1773 Changed |= enableGLCBit(MI);
1774 break;
1775 case SIAtomicScope::WAVEFRONT:
1776 case SIAtomicScope::SINGLETHREAD:
1777 // No cache to bypass.
1778 break;
1779 default:
1780 llvm_unreachable("Unsupported synchronization scope");
1781 }
1782 }
1783
1784 /// The scratch address space does not need the global memory caches
1785 /// to be bypassed as all memory operations by the same thread are
1786 /// sequentially consistent, and no other thread can access scratch
1787 /// memory.
1788
1789 /// Other address spaces do not have a cache.
1790
1791 return Changed;
1792 }
1793
1794 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1795 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1796 bool IsVolatile, bool IsNonTemporal) const {
1797
1798   // Only handle load and store, not atomic read-modify-write instructions. The
1799   // latter use glc to indicate if the atomic returns a result, so glc must not
1800   // be used for cache control.
1801 assert(MI->mayLoad() ^ MI->mayStore());
1802
1803   // Only update load and store, not LLVM IR atomic read-modify-write
1804   // instructions. The latter are always marked as volatile, so they cannot
1805   // sensibly be handled here without pessimizing all atomics. They also do not
1806   // support the nontemporal attribute.
1807 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1808
1809 bool Changed = false;
1810
1811 if (IsVolatile) {
1812 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1813 // and MISS_LRU for store instructions.
1814 // Note: there is no L2 cache coherent bypass control at the ISA level.
1815 if (Op == SIMemOp::LOAD) {
1816 Changed |= enableGLCBit(MI);
1817 Changed |= enableDLCBit(MI);
1818 }
1819
1820 // Ensure operation has completed at system scope to cause all volatile
1821 // operations to be visible outside the program in a global order. Do not
1822 // request cross address space as only the global address space can be
1823 // observable outside the program, so no need to cause a waitcnt for LDS
1824 // address space operations.
1825 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1826 Position::AFTER);
1827 return Changed;
1828 }
1829
1830 if (IsNonTemporal) {
1831 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1832 // and L2 cache policy to STREAM.
1833 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1834 // to MISS_EVICT and the L2 cache policy to STREAM.
1835 if (Op == SIMemOp::STORE)
1836 Changed |= enableGLCBit(MI);
1837 Changed |= enableSLCBit(MI);
1838
1839 return Changed;
1840 }
1841
1842 return Changed;
1843 }
1844
1845 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1846 SIAtomicScope Scope,
1847 SIAtomicAddrSpace AddrSpace,
1848 SIMemOp Op,
1849 bool IsCrossAddrSpaceOrdering,
1850 Position Pos) const {
1851 bool Changed = false;
1852
1853 MachineBasicBlock &MBB = *MI->getParent();
1854 DebugLoc DL = MI->getDebugLoc();
1855
1856 if (Pos == Position::AFTER)
1857 ++MI;
1858
1859 bool VMCnt = false;
1860 bool VSCnt = false;
1861 bool LGKMCnt = false;
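      // These flags select the hardware wait counters used below: vmcnt covers
      // VMEM loads, vscnt covers VMEM stores (a separate counter on GFX10+,
      // hence the dedicated S_WAITCNT_VSCNT), and lgkmcnt covers LDS and GDS
      // operations.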
1862
1863 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1864 SIAtomicAddrSpace::NONE) {
1865 switch (Scope) {
1866 case SIAtomicScope::SYSTEM:
1867 case SIAtomicScope::AGENT:
1868 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1869 VMCnt |= true;
1870 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1871 VSCnt |= true;
1872 break;
1873 case SIAtomicScope::WORKGROUP:
1874 // In WGP mode the waves of a work-group can be executing on either CU of
1875 // the WGP. Therefore need to wait for operations to complete to ensure
1876 // they are visible to waves in the other CU as the L0 is per CU.
1877       // Otherwise in CU mode all waves of a work-group are on the same CU
1878 // which shares the same L0.
1879 if (!ST.isCuModeEnabled()) {
1880 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1881 VMCnt |= true;
1882 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1883 VSCnt |= true;
1884 }
1885 break;
1886 case SIAtomicScope::WAVEFRONT:
1887 case SIAtomicScope::SINGLETHREAD:
1888 // The L0 cache keeps all memory operations in order for
1889 // work-items in the same wavefront.
1890 break;
1891 default:
1892 llvm_unreachable("Unsupported synchronization scope");
1893 }
1894 }
1895
1896 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1897 switch (Scope) {
1898 case SIAtomicScope::SYSTEM:
1899 case SIAtomicScope::AGENT:
1900 case SIAtomicScope::WORKGROUP:
1901 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1902 // not needed as LDS operations for all waves are executed in a total
1903 // global ordering as observed by all waves. Required if also
1904 // synchronizing with global/GDS memory as LDS operations could be
1905 // reordered with respect to later global/GDS memory operations of the
1906 // same wave.
1907 LGKMCnt |= IsCrossAddrSpaceOrdering;
1908 break;
1909 case SIAtomicScope::WAVEFRONT:
1910 case SIAtomicScope::SINGLETHREAD:
1911 // The LDS keeps all memory operations in order for
1912 // the same wavefront.
1913 break;
1914 default:
1915 llvm_unreachable("Unsupported synchronization scope");
1916 }
1917 }
1918
1919 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1920 switch (Scope) {
1921 case SIAtomicScope::SYSTEM:
1922 case SIAtomicScope::AGENT:
1923 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1924 // is not needed as GDS operations for all waves are executed in a total
1925 // global ordering as observed by all waves. Required if also
1926 // synchronizing with global/LDS memory as GDS operations could be
1927 // reordered with respect to later global/LDS memory operations of the
1928 // same wave.
1929 LGKMCnt |= IsCrossAddrSpaceOrdering;
1930 break;
1931 case SIAtomicScope::WORKGROUP:
1932 case SIAtomicScope::WAVEFRONT:
1933 case SIAtomicScope::SINGLETHREAD:
1934 // The GDS keeps all memory operations in order for
1935 // the same work-group.
1936 break;
1937 default:
1938 llvm_unreachable("Unsupported synchronization scope");
1939 }
1940 }
1941
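    // Encode the combined S_WAITCNT immediate: counters that must be waited on
    // are encoded as zero, while the others are left at their ISA-specific
    // maximum (the get*cntBitMask values) so no wait is imposed on them.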
1942 if (VMCnt || LGKMCnt) {
1943 unsigned WaitCntImmediate =
1944 AMDGPU::encodeWaitcnt(IV,
1945 VMCnt ? 0 : getVmcntBitMask(IV),
1946 getExpcntBitMask(IV),
1947 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1948 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1949 Changed = true;
1950 }
1951
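    // On this generation the store counter is not part of the S_WAITCNT
    // immediate, so an explicit "s_waitcnt_vscnt null, 0x0" is emitted instead.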
1952 if (VSCnt) {
1953 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1954 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1955 .addImm(0);
1956 Changed = true;
1957 }
1958
1959 if (Pos == Position::AFTER)
1960 --MI;
1961
1962 return Changed;
1963 }
1964
1965 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1966 SIAtomicScope Scope,
1967 SIAtomicAddrSpace AddrSpace,
1968 Position Pos) const {
1969 if (!InsertCacheInv)
1970 return false;
1971
1972 bool Changed = false;
1973
1974 MachineBasicBlock &MBB = *MI->getParent();
1975 DebugLoc DL = MI->getDebugLoc();
1976
1977 if (Pos == Position::AFTER)
1978 ++MI;
1979
1980 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1981 switch (Scope) {
1982 case SIAtomicScope::SYSTEM:
1983 case SIAtomicScope::AGENT:
1984 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1985 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1986 Changed = true;
1987 break;
1988 case SIAtomicScope::WORKGROUP:
1989 // In WGP mode the waves of a work-group can be executing on either CU of
1990 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1991       // in CU mode all waves of a work-group are on the same CU, and so the
1992 // L0 does not need to be invalidated.
1993 if (!ST.isCuModeEnabled()) {
1994 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1995 Changed = true;
1996 }
1997 break;
1998 case SIAtomicScope::WAVEFRONT:
1999 case SIAtomicScope::SINGLETHREAD:
2000 // No cache to invalidate.
2001 break;
2002 default:
2003 llvm_unreachable("Unsupported synchronization scope");
2004 }
2005 }
2006
2007 /// The scratch address space does not need the global memory cache
2008 /// to be flushed as all memory operations by the same thread are
2009 /// sequentially consistent, and no other thread can access scratch
2010 /// memory.
2011
2012 /// Other address spaces do not have a cache.
2013
2014 if (Pos == Position::AFTER)
2015 --MI;
2016
2017 return Changed;
2018 }
2019
2020 bool SIGfx11CacheControl::enableLoadCacheBypass(
2021 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2022 SIAtomicAddrSpace AddrSpace) const {
2023 assert(MI->mayLoad() && !MI->mayStore());
2024 bool Changed = false;
2025
2026 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2027 switch (Scope) {
2028 case SIAtomicScope::SYSTEM:
2029 case SIAtomicScope::AGENT:
2030 // Set the L0 and L1 cache policies to MISS_EVICT.
2031 // Note: there is no L2 cache coherent bypass control at the ISA level.
2032 Changed |= enableGLCBit(MI);
2033 break;
2034 case SIAtomicScope::WORKGROUP:
2035 // In WGP mode the waves of a work-group can be executing on either CU of
2036 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2037 // CU mode all waves of a work-group are on the same CU, and so the L0
2038 // does not need to be bypassed.
2039 if (!ST.isCuModeEnabled())
2040 Changed |= enableGLCBit(MI);
2041 break;
2042 case SIAtomicScope::WAVEFRONT:
2043 case SIAtomicScope::SINGLETHREAD:
2044 // No cache to bypass.
2045 break;
2046 default:
2047 llvm_unreachable("Unsupported synchronization scope");
2048 }
2049 }
2050
2051 /// The scratch address space does not need the global memory caches
2052 /// to be bypassed as all memory operations by the same thread are
2053 /// sequentially consistent, and no other thread can access scratch
2054 /// memory.
2055
2056 /// Other address spaces do not have a cache.
2057
2058 return Changed;
2059 }
2060
2061 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2062 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2063 bool IsVolatile, bool IsNonTemporal) const {
2064
2065   // Only handle load and store, not atomic read-modify-write instructions. The
2066   // latter use glc to indicate if the atomic returns a result, so glc must not
2067   // be used for cache control.
2068 assert(MI->mayLoad() ^ MI->mayStore());
2069
2070   // Only update load and store, not LLVM IR atomic read-modify-write
2071   // instructions. The latter are always marked as volatile, so they cannot
2072   // sensibly be handled here without pessimizing all atomics. They also do not
2073   // support the nontemporal attribute.
2074 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2075
2076 bool Changed = false;
2077
2078 if (IsVolatile) {
2079 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2080 // and MISS_LRU for store instructions.
2081 // Note: there is no L2 cache coherent bypass control at the ISA level.
2082 if (Op == SIMemOp::LOAD)
2083 Changed |= enableGLCBit(MI);
2084
2085 // Set MALL NOALLOC for load and store instructions.
2086 Changed |= enableDLCBit(MI);
2087
2088 // Ensure operation has completed at system scope to cause all volatile
2089 // operations to be visible outside the program in a global order. Do not
2090 // request cross address space as only the global address space can be
2091 // observable outside the program, so no need to cause a waitcnt for LDS
2092 // address space operations.
2093 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2094 Position::AFTER);
2095 return Changed;
2096 }
2097
2098 if (IsNonTemporal) {
2099 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2100 // and L2 cache policy to STREAM.
2101 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2102 // to MISS_EVICT and the L2 cache policy to STREAM.
2103 if (Op == SIMemOp::STORE)
2104 Changed |= enableGLCBit(MI);
2105 Changed |= enableSLCBit(MI);
2106
2107 // Set MALL NOALLOC for load and store instructions.
2108 Changed |= enableDLCBit(MI);
2109 return Changed;
2110 }
2111
2112 return Changed;
2113 }
2114
2115 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2116 if (AtomicPseudoMIs.empty())
2117 return false;
2118
2119 for (auto &MI : AtomicPseudoMIs)
2120 MI->eraseFromParent();
2121
2122 AtomicPseudoMIs.clear();
2123 return true;
2124 }
2125
2126 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2127 MachineBasicBlock::iterator &MI) {
2128 assert(MI->mayLoad() && !MI->mayStore());
2129
2130 bool Changed = false;
2131
2132 if (MOI.isAtomic()) {
2133 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2134 MOI.getOrdering() == AtomicOrdering::Acquire ||
2135 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2136 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2137 MOI.getOrderingAddrSpace());
2138 }
2139
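    // A sequentially consistent load must be ordered after all earlier memory
    // operations on the ordering address space, so wait for them to complete
    // before the load is issued.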
2140 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2141 Changed |= CC->insertWait(MI, MOI.getScope(),
2142 MOI.getOrderingAddrSpace(),
2143 SIMemOp::LOAD | SIMemOp::STORE,
2144 MOI.getIsCrossAddressSpaceOrdering(),
2145 Position::BEFORE);
2146
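    // For acquire and seq_cst loads, wait for the load itself to complete and
    // then invalidate caches so later accesses cannot observe stale data.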
2147 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2148 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2149 Changed |= CC->insertWait(MI, MOI.getScope(),
2150 MOI.getInstrAddrSpace(),
2151 SIMemOp::LOAD,
2152 MOI.getIsCrossAddressSpaceOrdering(),
2153 Position::AFTER);
2154 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2155 MOI.getOrderingAddrSpace(),
2156 Position::AFTER);
2157 }
2158
2159 return Changed;
2160 }
2161
2162 // Atomic instructions already bypass caches to the scope specified by the
2163 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2164 // need additional treatment.
2165 Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2166 SIMemOp::LOAD, MOI.isVolatile(),
2167 MOI.isNonTemporal());
2168 return Changed;
2169 }
2170
2171 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2172 MachineBasicBlock::iterator &MI) {
2173 assert(!MI->mayLoad() && MI->mayStore());
2174
2175 bool Changed = false;
2176
2177 if (MOI.isAtomic()) {
2178 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2179 MOI.getOrdering() == AtomicOrdering::Release ||
2180 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2181 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2182 MOI.getOrderingAddrSpace());
2183 }
2184
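    // Release and seq_cst stores must make earlier memory operations visible
    // before the store itself, so insert the release sequence (waits plus any
    // cache write-back) before the instruction.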
2185 if (MOI.getOrdering() == AtomicOrdering::Release ||
2186 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2187 Changed |= CC->insertRelease(MI, MOI.getScope(),
2188 MOI.getOrderingAddrSpace(),
2189 MOI.getIsCrossAddressSpaceOrdering(),
2190 Position::BEFORE);
2191
2192 return Changed;
2193 }
2194
2195 // Atomic instructions already bypass caches to the scope specified by the
2196 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2197 // need additional treatment.
2198 Changed |= CC->enableVolatileAndOrNonTemporal(
2199 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2200 MOI.isNonTemporal());
2201 return Changed;
2202 }
2203
2204 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2205 MachineBasicBlock::iterator &MI) {
2206 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2207
2208 AtomicPseudoMIs.push_back(MI);
2209 bool Changed = false;
2210
2211 if (MOI.isAtomic()) {
2212 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2213 MOI.getOrdering() == AtomicOrdering::Release ||
2214 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2215 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2216 /// TODO: This relies on a barrier always generating a waitcnt
2217 /// for LDS to ensure it is not reordered with the completion of
2218       /// the preceding LDS operations. If the barrier had a memory
2219       /// ordering and memory scope, then the library would not need to
2220 /// generate a fence. Could add support in this file for
2221 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2222 /// adding S_WAITCNT before a S_BARRIER.
2223 Changed |= CC->insertRelease(MI, MOI.getScope(),
2224 MOI.getOrderingAddrSpace(),
2225 MOI.getIsCrossAddressSpaceOrdering(),
2226 Position::BEFORE);
2227
2228 // TODO: If both release and invalidate are happening they could be combined
2229 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2230 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2231 // track cache invalidate and write back instructions.
2232
2233 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2234 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2235 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2236 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2237 MOI.getOrderingAddrSpace(),
2238 Position::BEFORE);
2239
2240 return Changed;
2241 }
2242
2243 return Changed;
2244 }
2245
2246 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2247 MachineBasicBlock::iterator &MI) {
2248 assert(MI->mayLoad() && MI->mayStore());
2249
2250 bool Changed = false;
2251
2252 if (MOI.isAtomic()) {
2253 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2254 MOI.getOrdering() == AtomicOrdering::Acquire ||
2255 MOI.getOrdering() == AtomicOrdering::Release ||
2256 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2257 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2258 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2259 MOI.getInstrAddrSpace());
2260 }
2261
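    // Orderings with release semantics, including a seq_cst compare-exchange
    // failure ordering, require the release sequence before the atomic.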
2262 if (MOI.getOrdering() == AtomicOrdering::Release ||
2263 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2264 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2265 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2266 Changed |= CC->insertRelease(MI, MOI.getScope(),
2267 MOI.getOrderingAddrSpace(),
2268 MOI.getIsCrossAddressSpaceOrdering(),
2269 Position::BEFORE);
2270
2271 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2272 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2273 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2274 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2275 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2276 Changed |= CC->insertWait(MI, MOI.getScope(),
2277 MOI.getInstrAddrSpace(),
2278 isAtomicRet(*MI) ? SIMemOp::LOAD :
2279 SIMemOp::STORE,
2280 MOI.getIsCrossAddressSpaceOrdering(),
2281 Position::AFTER);
2282 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2283 MOI.getOrderingAddrSpace(),
2284 Position::AFTER);
2285 }
2286
2287 return Changed;
2288 }
2289
2290 return Changed;
2291 }
2292
2293 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2294 bool Changed = false;
2295
2296 SIMemOpAccess MOA(MF);
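  // Select the cache-control implementation matching the subtarget generation
  // (for example the SIGfx10CacheControl and SIGfx11CacheControl variants
  // defined above).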
2297 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2298
2299 for (auto &MBB : MF) {
2300 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2301
2302 // Unbundle instructions after the post-RA scheduler.
2303 if (MI->isBundle() && MI->mayLoadOrStore()) {
2304 MachineBasicBlock::instr_iterator II(MI->getIterator());
2305 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2306 I != E && I->isBundledWithPred(); ++I) {
2307 I->unbundleFromPred();
2308 for (MachineOperand &MO : I->operands())
2309 if (MO.isReg())
2310 MO.setIsInternalRead(false);
2311 }
2312
2313 MI->eraseFromParent();
2314 MI = II->getIterator();
2315 }
2316
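      // Only instructions flagged as maybeAtomic can require memory-model
      // legalization; skip everything else early.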
2317 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2318 continue;
2319
2320 if (const auto &MOI = MOA.getLoadInfo(MI))
2321 Changed |= expandLoad(*MOI, MI);
2322 else if (const auto &MOI = MOA.getStoreInfo(MI))
2323 Changed |= expandStore(*MOI, MI);
2324 else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2325 Changed |= expandAtomicFence(*MOI, MI);
2326 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2327 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2328 }
2329 }
2330
2331 Changed |= removeAtomicPseudoMIs();
2332 return Changed;
2333 }
2334
2335 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2336
2337 char SIMemoryLegalizer::ID = 0;
2338 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2339
2340 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2341 return new SIMemoryLegalizer();
2342 }
2343