1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/BitmaskEnum.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/IR/DiagnosticInfo.h" 24 #include "llvm/Support/AtomicOrdering.h" 25 #include "llvm/TargetParser/TargetParser.h" 26 27 using namespace llvm; 28 using namespace llvm::AMDGPU; 29 30 #define DEBUG_TYPE "si-memory-legalizer" 31 #define PASS_NAME "SI Memory Legalizer" 32 33 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 35 cl::desc("Use this to skip inserting cache invalidating instructions.")); 36 37 namespace { 38 39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 40 41 /// Memory operation flags. Can be ORed together. 42 enum class SIMemOp { 43 NONE = 0u, 44 LOAD = 1u << 0, 45 STORE = 1u << 1, 46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 47 }; 48 49 /// Position to insert a new instruction relative to an existing 50 /// instruction. 51 enum class Position { 52 BEFORE, 53 AFTER 54 }; 55 56 /// The atomic synchronization scopes supported by the AMDGPU target. 57 enum class SIAtomicScope { 58 NONE, 59 SINGLETHREAD, 60 WAVEFRONT, 61 WORKGROUP, 62 AGENT, 63 SYSTEM 64 }; 65 66 /// The distinct address spaces supported by the AMDGPU target for 67 /// atomic memory operation. Can be ORed together. 68 enum class SIAtomicAddrSpace { 69 NONE = 0u, 70 GLOBAL = 1u << 0, 71 LDS = 1u << 1, 72 SCRATCH = 1u << 2, 73 GDS = 1u << 3, 74 OTHER = 1u << 4, 75 76 /// The address spaces that can be accessed by a FLAT instruction. 77 FLAT = GLOBAL | LDS | SCRATCH, 78 79 /// The address spaces that support atomic instructions. 80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 81 82 /// All address spaces. 
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, 84 85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 86 }; 87 88 class SIMemOpInfo final { 89 private: 90 91 friend class SIMemOpAccess; 92 93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 95 SIAtomicScope Scope = SIAtomicScope::SYSTEM; 96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 98 bool IsCrossAddressSpaceOrdering = false; 99 bool IsVolatile = false; 100 bool IsNonTemporal = false; 101 102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, 103 SIAtomicScope Scope = SIAtomicScope::SYSTEM, 104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, 105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, 106 bool IsCrossAddressSpaceOrdering = true, 107 AtomicOrdering FailureOrdering = 108 AtomicOrdering::SequentiallyConsistent, 109 bool IsVolatile = false, 110 bool IsNonTemporal = false) 111 : Ordering(Ordering), FailureOrdering(FailureOrdering), 112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), 113 InstrAddrSpace(InstrAddrSpace), 114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), 115 IsVolatile(IsVolatile), 116 IsNonTemporal(IsNonTemporal) { 117 118 if (Ordering == AtomicOrdering::NotAtomic) { 119 assert(Scope == SIAtomicScope::NONE && 120 OrderingAddrSpace == SIAtomicAddrSpace::NONE && 121 !IsCrossAddressSpaceOrdering && 122 FailureOrdering == AtomicOrdering::NotAtomic); 123 return; 124 } 125 126 assert(Scope != SIAtomicScope::NONE && 127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != 128 SIAtomicAddrSpace::NONE && 129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != 130 SIAtomicAddrSpace::NONE); 131 132 // There is also no cross address space ordering if the ordering 133 // address space is the same as the instruction address space and 134 // only contains a single address space. 135 if ((OrderingAddrSpace == InstrAddrSpace) && 136 isPowerOf2_32(uint32_t(InstrAddrSpace))) 137 this->IsCrossAddressSpaceOrdering = false; 138 139 // Limit the scope to the maximum supported by the instruction's address 140 // spaces. 141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == 142 SIAtomicAddrSpace::NONE) { 143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); 144 } else if ((InstrAddrSpace & 145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == 146 SIAtomicAddrSpace::NONE) { 147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); 148 } else if ((InstrAddrSpace & 149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | 150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { 151 this->Scope = std::min(Scope, SIAtomicScope::AGENT); 152 } 153 } 154 155 public: 156 /// \returns Atomic synchronization scope of the machine instruction used to 157 /// create this SIMemOpInfo. 158 SIAtomicScope getScope() const { 159 return Scope; 160 } 161 162 /// \returns Ordering constraint of the machine instruction used to 163 /// create this SIMemOpInfo. 164 AtomicOrdering getOrdering() const { 165 return Ordering; 166 } 167 168 /// \returns Failure ordering constraint of the machine instruction used to 169 /// create this SIMemOpInfo. 170 AtomicOrdering getFailureOrdering() const { 171 return FailureOrdering; 172 } 173 174 /// \returns The address spaces be accessed by the machine 175 /// instruction used to create this SIMemOpInfo. 
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on different address
  /// spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the SI atomic address spaces that can be accessed
  /// through the target address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
272 bool InsertCacheInv; 273 274 SICacheControl(const GCNSubtarget &ST); 275 276 /// Sets named bit \p BitName to "true" if present in instruction \p MI. 277 /// \returns Returns true if \p MI is modified, false otherwise. 278 bool enableNamedBit(const MachineBasicBlock::iterator MI, 279 AMDGPU::CPol::CPol Bit) const; 280 281 public: 282 283 /// Create a cache control for the subtarget \p ST. 284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 285 286 /// Update \p MI memory load instruction to bypass any caches up to 287 /// the \p Scope memory scope for address spaces \p 288 /// AddrSpace. Return true iff the instruction was modified. 289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 290 SIAtomicScope Scope, 291 SIAtomicAddrSpace AddrSpace) const = 0; 292 293 /// Update \p MI memory store instruction to bypass any caches up to 294 /// the \p Scope memory scope for address spaces \p 295 /// AddrSpace. Return true iff the instruction was modified. 296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 297 SIAtomicScope Scope, 298 SIAtomicAddrSpace AddrSpace) const = 0; 299 300 /// Update \p MI memory read-modify-write instruction to bypass any caches up 301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true 302 /// iff the instruction was modified. 303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 304 SIAtomicScope Scope, 305 SIAtomicAddrSpace AddrSpace) const = 0; 306 307 /// Update \p MI memory instruction of kind \p Op associated with address 308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return 309 /// true iff the instruction was modified. 310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 311 SIAtomicAddrSpace AddrSpace, 312 SIMemOp Op, bool IsVolatile, 313 bool IsNonTemporal) const = 0; 314 315 virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { 316 return false; 317 }; 318 319 /// Inserts any necessary instructions at position \p Pos relative 320 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 321 /// \p Op associated with address spaces \p AddrSpace have completed. Used 322 /// between memory instructions to enforce the order they become visible as 323 /// observed by other memory instructions executing in memory scope \p Scope. 324 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 325 /// address spaces. Returns true iff any instructions inserted. 326 virtual bool insertWait(MachineBasicBlock::iterator &MI, 327 SIAtomicScope Scope, 328 SIAtomicAddrSpace AddrSpace, 329 SIMemOp Op, 330 bool IsCrossAddrSpaceOrdering, 331 Position Pos) const = 0; 332 333 /// Inserts any necessary instructions at position \p Pos relative to 334 /// instruction \p MI to ensure any subsequent memory instructions of this 335 /// thread with address spaces \p AddrSpace will observe the previous memory 336 /// operations by any thread for memory scopes up to memory scope \p Scope . 337 /// Returns true iff any instructions inserted. 
338 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 339 SIAtomicScope Scope, 340 SIAtomicAddrSpace AddrSpace, 341 Position Pos) const = 0; 342 343 /// Inserts any necessary instructions at position \p Pos relative to 344 /// instruction \p MI to ensure previous memory instructions by this thread 345 /// with address spaces \p AddrSpace have completed and can be observed by 346 /// subsequent memory instructions by any thread executing in memory scope \p 347 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is 348 /// between address spaces. Returns true iff any instructions inserted. 349 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 350 SIAtomicScope Scope, 351 SIAtomicAddrSpace AddrSpace, 352 bool IsCrossAddrSpaceOrdering, 353 Position Pos) const = 0; 354 355 /// Virtual destructor to allow derivations to be deleted. 356 virtual ~SICacheControl() = default; 357 358 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 359 MachineBasicBlock::iterator &MI) const { 360 return false; 361 } 362 }; 363 364 class SIGfx6CacheControl : public SICacheControl { 365 protected: 366 367 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 368 /// is modified, false otherwise. 369 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 370 return enableNamedBit(MI, AMDGPU::CPol::GLC); 371 } 372 373 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 374 /// is modified, false otherwise. 375 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 376 return enableNamedBit(MI, AMDGPU::CPol::SLC); 377 } 378 379 public: 380 381 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} 382 383 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 384 SIAtomicScope Scope, 385 SIAtomicAddrSpace AddrSpace) const override; 386 387 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 388 SIAtomicScope Scope, 389 SIAtomicAddrSpace AddrSpace) const override; 390 391 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 392 SIAtomicScope Scope, 393 SIAtomicAddrSpace AddrSpace) const override; 394 395 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 396 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 397 bool IsVolatile, 398 bool IsNonTemporal) const override; 399 400 bool insertWait(MachineBasicBlock::iterator &MI, 401 SIAtomicScope Scope, 402 SIAtomicAddrSpace AddrSpace, 403 SIMemOp Op, 404 bool IsCrossAddrSpaceOrdering, 405 Position Pos) const override; 406 407 bool insertAcquire(MachineBasicBlock::iterator &MI, 408 SIAtomicScope Scope, 409 SIAtomicAddrSpace AddrSpace, 410 Position Pos) const override; 411 412 bool insertRelease(MachineBasicBlock::iterator &MI, 413 SIAtomicScope Scope, 414 SIAtomicAddrSpace AddrSpace, 415 bool IsCrossAddrSpaceOrdering, 416 Position Pos) const override; 417 }; 418 419 class SIGfx7CacheControl : public SIGfx6CacheControl { 420 public: 421 422 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} 423 424 bool insertAcquire(MachineBasicBlock::iterator &MI, 425 SIAtomicScope Scope, 426 SIAtomicAddrSpace AddrSpace, 427 Position Pos) const override; 428 429 }; 430 431 class SIGfx90ACacheControl : public SIGfx7CacheControl { 432 public: 433 434 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 435 436 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 437 SIAtomicScope Scope, 438 SIAtomicAddrSpace AddrSpace) const override; 439 440 bool enableStoreCacheBypass(const 
MachineBasicBlock::iterator &MI, 441 SIAtomicScope Scope, 442 SIAtomicAddrSpace AddrSpace) const override; 443 444 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 445 SIAtomicScope Scope, 446 SIAtomicAddrSpace AddrSpace) const override; 447 448 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 449 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 450 bool IsVolatile, 451 bool IsNonTemporal) const override; 452 453 bool insertWait(MachineBasicBlock::iterator &MI, 454 SIAtomicScope Scope, 455 SIAtomicAddrSpace AddrSpace, 456 SIMemOp Op, 457 bool IsCrossAddrSpaceOrdering, 458 Position Pos) const override; 459 460 bool insertAcquire(MachineBasicBlock::iterator &MI, 461 SIAtomicScope Scope, 462 SIAtomicAddrSpace AddrSpace, 463 Position Pos) const override; 464 465 bool insertRelease(MachineBasicBlock::iterator &MI, 466 SIAtomicScope Scope, 467 SIAtomicAddrSpace AddrSpace, 468 bool IsCrossAddrSpaceOrdering, 469 Position Pos) const override; 470 }; 471 472 class SIGfx940CacheControl : public SIGfx90ACacheControl { 473 protected: 474 475 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI 476 /// is modified, false otherwise. 477 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { 478 return enableNamedBit(MI, AMDGPU::CPol::SC0); 479 } 480 481 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI 482 /// is modified, false otherwise. 483 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { 484 return enableNamedBit(MI, AMDGPU::CPol::SC1); 485 } 486 487 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI 488 /// is modified, false otherwise. 489 bool enableNTBit(const MachineBasicBlock::iterator &MI) const { 490 return enableNamedBit(MI, AMDGPU::CPol::NT); 491 } 492 493 public: 494 495 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; 496 497 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 498 SIAtomicScope Scope, 499 SIAtomicAddrSpace AddrSpace) const override; 500 501 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 502 SIAtomicScope Scope, 503 SIAtomicAddrSpace AddrSpace) const override; 504 505 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 506 SIAtomicScope Scope, 507 SIAtomicAddrSpace AddrSpace) const override; 508 509 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 510 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 511 bool IsVolatile, 512 bool IsNonTemporal) const override; 513 514 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 515 SIAtomicAddrSpace AddrSpace, Position Pos) const override; 516 517 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 518 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, 519 Position Pos) const override; 520 521 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 522 MachineBasicBlock::iterator &MI) const override { 523 bool Changed = false; 524 if (ST.hasForceStoreSC0SC1() && 525 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | 526 SIAtomicAddrSpace::GLOBAL | 527 SIAtomicAddrSpace::OTHER)) != 528 SIAtomicAddrSpace::NONE) { 529 Changed |= enableSC0Bit(MI); 530 Changed |= enableSC1Bit(MI); 531 } 532 return Changed; 533 } 534 }; 535 536 class SIGfx10CacheControl : public SIGfx7CacheControl { 537 protected: 538 539 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 540 /// is modified, false otherwise. 
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if the CPol operand is present in instruction
  // \p MI. \returns True if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if the CPol operand is present in
  // instruction \p MI. \returns True if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
634 bool isAtomicRet(const MachineInstr &MI) const { 635 return SIInstrInfo::isAtomicRet(MI); 636 } 637 638 /// Removes all processed atomic pseudo instructions from the current 639 /// function. Returns true if current function is modified, false otherwise. 640 bool removeAtomicPseudoMIs(); 641 642 /// Expands load operation \p MI. Returns true if instructions are 643 /// added/deleted or \p MI is modified, false otherwise. 644 bool expandLoad(const SIMemOpInfo &MOI, 645 MachineBasicBlock::iterator &MI); 646 /// Expands store operation \p MI. Returns true if instructions are 647 /// added/deleted or \p MI is modified, false otherwise. 648 bool expandStore(const SIMemOpInfo &MOI, 649 MachineBasicBlock::iterator &MI); 650 /// Expands atomic fence operation \p MI. Returns true if 651 /// instructions are added/deleted or \p MI is modified, false otherwise. 652 bool expandAtomicFence(const SIMemOpInfo &MOI, 653 MachineBasicBlock::iterator &MI); 654 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if 655 /// instructions are added/deleted or \p MI is modified, false otherwise. 656 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 657 MachineBasicBlock::iterator &MI); 658 659 public: 660 static char ID; 661 662 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 663 664 void getAnalysisUsage(AnalysisUsage &AU) const override { 665 AU.setPreservesCFG(); 666 MachineFunctionPass::getAnalysisUsage(AU); 667 } 668 669 StringRef getPassName() const override { 670 return PASS_NAME; 671 } 672 673 bool runOnMachineFunction(MachineFunction &MF) override; 674 }; 675 676 } // end namespace anonymous 677 678 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 679 const char *Msg) const { 680 const Function &Func = MI->getParent()->getParent()->getFunction(); 681 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 682 Func.getContext().diagnose(Diag); 683 } 684 685 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 686 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 687 SIAtomicAddrSpace InstrAddrSpace) const { 688 if (SSID == SyncScope::System) 689 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); 690 if (SSID == MMI->getAgentSSID()) 691 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); 692 if (SSID == MMI->getWorkgroupSSID()) 693 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, 694 true); 695 if (SSID == MMI->getWavefrontSSID()) 696 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, 697 true); 698 if (SSID == SyncScope::SingleThread) 699 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, 700 true); 701 if (SSID == MMI->getSystemOneAddressSpaceSSID()) 702 return std::tuple(SIAtomicScope::SYSTEM, 703 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 704 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 705 return std::tuple(SIAtomicScope::AGENT, 706 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 707 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 708 return std::tuple(SIAtomicScope::WORKGROUP, 709 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 710 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 711 return std::tuple(SIAtomicScope::WAVEFRONT, 712 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 713 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 714 return std::tuple(SIAtomicScope::SINGLETHREAD, 715 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 716 return std::nullopt; 717 } 718 719 
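// Illustrative mapping (a sketch of the cases handled above, assuming the
// sync scope names documented in AMDGPUUsage): a fence such as
//   fence syncscope("agent") acquire
// maps to (AGENT, ATOMIC, true), while the one-address-space variant
//   fence syncscope("agent-one-as") acquire
// maps to (AGENT, ATOMIC & InstrAddrSpace, false), i.e. it only orders the
// address spaces accessed by the instruction and requests no
// cross-address-space ordering.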
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 720 if (AS == AMDGPUAS::FLAT_ADDRESS) 721 return SIAtomicAddrSpace::FLAT; 722 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 723 return SIAtomicAddrSpace::GLOBAL; 724 if (AS == AMDGPUAS::LOCAL_ADDRESS) 725 return SIAtomicAddrSpace::LDS; 726 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 727 return SIAtomicAddrSpace::SCRATCH; 728 if (AS == AMDGPUAS::REGION_ADDRESS) 729 return SIAtomicAddrSpace::GDS; 730 731 return SIAtomicAddrSpace::OTHER; 732 } 733 734 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 735 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 736 } 737 738 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 739 const MachineBasicBlock::iterator &MI) const { 740 assert(MI->getNumMemOperands() > 0); 741 742 SyncScope::ID SSID = SyncScope::SingleThread; 743 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 744 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 745 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 746 bool IsNonTemporal = true; 747 bool IsVolatile = false; 748 749 // Validator should check whether or not MMOs cover the entire set of 750 // locations accessed by the memory instruction. 751 for (const auto &MMO : MI->memoperands()) { 752 IsNonTemporal &= MMO->isNonTemporal(); 753 IsVolatile |= MMO->isVolatile(); 754 InstrAddrSpace |= 755 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 756 AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); 757 if (OpOrdering != AtomicOrdering::NotAtomic) { 758 const auto &IsSyncScopeInclusion = 759 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 760 if (!IsSyncScopeInclusion) { 761 reportUnsupported(MI, 762 "Unsupported non-inclusive atomic synchronization scope"); 763 return std::nullopt; 764 } 765 766 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); 767 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); 768 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 769 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 770 FailureOrdering = 771 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); 772 } 773 } 774 775 SIAtomicScope Scope = SIAtomicScope::NONE; 776 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 777 bool IsCrossAddressSpaceOrdering = false; 778 if (Ordering != AtomicOrdering::NotAtomic) { 779 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 780 if (!ScopeOrNone) { 781 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 782 return std::nullopt; 783 } 784 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 785 *ScopeOrNone; 786 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 787 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 788 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 789 reportUnsupported(MI, "Unsupported atomic address space"); 790 return std::nullopt; 791 } 792 } 793 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 794 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 795 IsNonTemporal); 796 } 797 798 std::optional<SIMemOpInfo> 799 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { 800 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 801 802 if (!(MI->mayLoad() && !MI->mayStore())) 803 return std::nullopt; 804 805 // Be conservative if there are no memory operands. 
806 if (MI->getNumMemOperands() == 0) 807 return SIMemOpInfo(); 808 809 return constructFromMIWithMMO(MI); 810 } 811 812 std::optional<SIMemOpInfo> 813 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { 814 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 815 816 if (!(!MI->mayLoad() && MI->mayStore())) 817 return std::nullopt; 818 819 // Be conservative if there are no memory operands. 820 if (MI->getNumMemOperands() == 0) 821 return SIMemOpInfo(); 822 823 return constructFromMIWithMMO(MI); 824 } 825 826 std::optional<SIMemOpInfo> 827 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { 828 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 829 830 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 831 return std::nullopt; 832 833 AtomicOrdering Ordering = 834 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 835 836 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 837 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 838 if (!ScopeOrNone) { 839 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 840 return std::nullopt; 841 } 842 843 SIAtomicScope Scope = SIAtomicScope::NONE; 844 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 845 bool IsCrossAddressSpaceOrdering = false; 846 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 847 *ScopeOrNone; 848 849 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 850 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 851 reportUnsupported(MI, "Unsupported atomic address space"); 852 return std::nullopt; 853 } 854 855 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 856 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); 857 } 858 859 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 860 const MachineBasicBlock::iterator &MI) const { 861 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 862 863 if (!(MI->mayLoad() && MI->mayStore())) 864 return std::nullopt; 865 866 // Be conservative if there are no memory operands. 
867 if (MI->getNumMemOperands() == 0) 868 return SIMemOpInfo(); 869 870 return constructFromMIWithMMO(MI); 871 } 872 873 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 874 TII = ST.getInstrInfo(); 875 IV = getIsaVersion(ST.getCPU()); 876 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 877 } 878 879 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 880 AMDGPU::CPol::CPol Bit) const { 881 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 882 if (!CPol) 883 return false; 884 885 CPol->setImm(CPol->getImm() | Bit); 886 return true; 887 } 888 889 /* static */ 890 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 891 GCNSubtarget::Generation Generation = ST.getGeneration(); 892 if (ST.hasGFX940Insts()) 893 return std::make_unique<SIGfx940CacheControl>(ST); 894 if (ST.hasGFX90AInsts()) 895 return std::make_unique<SIGfx90ACacheControl>(ST); 896 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 897 return std::make_unique<SIGfx6CacheControl>(ST); 898 if (Generation < AMDGPUSubtarget::GFX10) 899 return std::make_unique<SIGfx7CacheControl>(ST); 900 if (Generation < AMDGPUSubtarget::GFX11) 901 return std::make_unique<SIGfx10CacheControl>(ST); 902 if (Generation < AMDGPUSubtarget::GFX12) 903 return std::make_unique<SIGfx11CacheControl>(ST); 904 return std::make_unique<SIGfx12CacheControl>(ST); 905 } 906 907 bool SIGfx6CacheControl::enableLoadCacheBypass( 908 const MachineBasicBlock::iterator &MI, 909 SIAtomicScope Scope, 910 SIAtomicAddrSpace AddrSpace) const { 911 assert(MI->mayLoad() && !MI->mayStore()); 912 bool Changed = false; 913 914 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 915 switch (Scope) { 916 case SIAtomicScope::SYSTEM: 917 case SIAtomicScope::AGENT: 918 // Set L1 cache policy to MISS_EVICT. 919 // Note: there is no L2 cache bypass policy at the ISA level. 920 Changed |= enableGLCBit(MI); 921 break; 922 case SIAtomicScope::WORKGROUP: 923 case SIAtomicScope::WAVEFRONT: 924 case SIAtomicScope::SINGLETHREAD: 925 // No cache to bypass. 926 break; 927 default: 928 llvm_unreachable("Unsupported synchronization scope"); 929 } 930 } 931 932 /// The scratch address space does not need the global memory caches 933 /// to be bypassed as all memory operations by the same thread are 934 /// sequentially consistent, and no other thread can access scratch 935 /// memory. 936 937 /// Other address spaces do not have a cache. 938 939 return Changed; 940 } 941 942 bool SIGfx6CacheControl::enableStoreCacheBypass( 943 const MachineBasicBlock::iterator &MI, 944 SIAtomicScope Scope, 945 SIAtomicAddrSpace AddrSpace) const { 946 assert(!MI->mayLoad() && MI->mayStore()); 947 bool Changed = false; 948 949 /// The L1 cache is write through so does not need to be bypassed. There is no 950 /// bypass control for the L2 cache at the isa level. 951 952 return Changed; 953 } 954 955 bool SIGfx6CacheControl::enableRMWCacheBypass( 956 const MachineBasicBlock::iterator &MI, 957 SIAtomicScope Scope, 958 SIAtomicAddrSpace AddrSpace) const { 959 assert(MI->mayLoad() && MI->mayStore()); 960 bool Changed = false; 961 962 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically 963 /// bypassed, and the GLC bit is instead used to indicate if they are 964 /// return or no-return. 965 /// Note: there is no L2 cache coherent bypass control at the ISA level. 
  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the nontemporal
  // attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
1061 LGKMCnt |= IsCrossAddrSpaceOrdering; 1062 break; 1063 case SIAtomicScope::WAVEFRONT: 1064 case SIAtomicScope::SINGLETHREAD: 1065 // The LDS keeps all memory operations in order for 1066 // the same wavefront. 1067 break; 1068 default: 1069 llvm_unreachable("Unsupported synchronization scope"); 1070 } 1071 } 1072 1073 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1074 switch (Scope) { 1075 case SIAtomicScope::SYSTEM: 1076 case SIAtomicScope::AGENT: 1077 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1078 // is not needed as GDS operations for all waves are executed in a total 1079 // global ordering as observed by all waves. Required if also 1080 // synchronizing with global/LDS memory as GDS operations could be 1081 // reordered with respect to later global/LDS memory operations of the 1082 // same wave. 1083 LGKMCnt |= IsCrossAddrSpaceOrdering; 1084 break; 1085 case SIAtomicScope::WORKGROUP: 1086 case SIAtomicScope::WAVEFRONT: 1087 case SIAtomicScope::SINGLETHREAD: 1088 // The GDS keeps all memory operations in order for 1089 // the same work-group. 1090 break; 1091 default: 1092 llvm_unreachable("Unsupported synchronization scope"); 1093 } 1094 } 1095 1096 if (VMCnt || LGKMCnt) { 1097 unsigned WaitCntImmediate = 1098 AMDGPU::encodeWaitcnt(IV, 1099 VMCnt ? 0 : getVmcntBitMask(IV), 1100 getExpcntBitMask(IV), 1101 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 1102 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 1103 .addImm(WaitCntImmediate); 1104 Changed = true; 1105 } 1106 1107 if (Pos == Position::AFTER) 1108 --MI; 1109 1110 return Changed; 1111 } 1112 1113 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1114 SIAtomicScope Scope, 1115 SIAtomicAddrSpace AddrSpace, 1116 Position Pos) const { 1117 if (!InsertCacheInv) 1118 return false; 1119 1120 bool Changed = false; 1121 1122 MachineBasicBlock &MBB = *MI->getParent(); 1123 DebugLoc DL = MI->getDebugLoc(); 1124 1125 if (Pos == Position::AFTER) 1126 ++MI; 1127 1128 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1129 switch (Scope) { 1130 case SIAtomicScope::SYSTEM: 1131 case SIAtomicScope::AGENT: 1132 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 1133 Changed = true; 1134 break; 1135 case SIAtomicScope::WORKGROUP: 1136 case SIAtomicScope::WAVEFRONT: 1137 case SIAtomicScope::SINGLETHREAD: 1138 // No cache to invalidate. 1139 break; 1140 default: 1141 llvm_unreachable("Unsupported synchronization scope"); 1142 } 1143 } 1144 1145 /// The scratch address space does not need the global memory cache 1146 /// to be flushed as all memory operations by the same thread are 1147 /// sequentially consistent, and no other thread can access scratch 1148 /// memory. 1149 1150 /// Other address spaces do not have a cache. 
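  // Illustrative expansion (a sketch, assuming the usual acquire lowering
  // performed by this pass): for an agent-scope load-acquire from global
  // memory on GFX6, the "BUFFER_WBINVL1" built above typically ends up after
  // the load and its wait, e.g.:
  //   buffer_load_dword v0, ... glc
  //   s_waitcnt vmcnt(0)
  //   buffer_wbinvl1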
1151 1152 if (Pos == Position::AFTER) 1153 --MI; 1154 1155 return Changed; 1156 } 1157 1158 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1159 SIAtomicScope Scope, 1160 SIAtomicAddrSpace AddrSpace, 1161 bool IsCrossAddrSpaceOrdering, 1162 Position Pos) const { 1163 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1164 IsCrossAddrSpaceOrdering, Pos); 1165 } 1166 1167 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1168 SIAtomicScope Scope, 1169 SIAtomicAddrSpace AddrSpace, 1170 Position Pos) const { 1171 if (!InsertCacheInv) 1172 return false; 1173 1174 bool Changed = false; 1175 1176 MachineBasicBlock &MBB = *MI->getParent(); 1177 DebugLoc DL = MI->getDebugLoc(); 1178 1179 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 1180 1181 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 1182 ? AMDGPU::BUFFER_WBINVL1 1183 : AMDGPU::BUFFER_WBINVL1_VOL; 1184 1185 if (Pos == Position::AFTER) 1186 ++MI; 1187 1188 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1189 switch (Scope) { 1190 case SIAtomicScope::SYSTEM: 1191 case SIAtomicScope::AGENT: 1192 BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 1193 Changed = true; 1194 break; 1195 case SIAtomicScope::WORKGROUP: 1196 case SIAtomicScope::WAVEFRONT: 1197 case SIAtomicScope::SINGLETHREAD: 1198 // No cache to invalidate. 1199 break; 1200 default: 1201 llvm_unreachable("Unsupported synchronization scope"); 1202 } 1203 } 1204 1205 /// The scratch address space does not need the global memory cache 1206 /// to be flushed as all memory operations by the same thread are 1207 /// sequentially consistent, and no other thread can access scratch 1208 /// memory. 1209 1210 /// Other address spaces do not have a cache. 1211 1212 if (Pos == Position::AFTER) 1213 --MI; 1214 1215 return Changed; 1216 } 1217 1218 bool SIGfx90ACacheControl::enableLoadCacheBypass( 1219 const MachineBasicBlock::iterator &MI, 1220 SIAtomicScope Scope, 1221 SIAtomicAddrSpace AddrSpace) const { 1222 assert(MI->mayLoad() && !MI->mayStore()); 1223 bool Changed = false; 1224 1225 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1226 switch (Scope) { 1227 case SIAtomicScope::SYSTEM: 1228 case SIAtomicScope::AGENT: 1229 // Set the L1 cache policy to MISS_LRU. 1230 // Note: there is no L2 cache bypass policy at the ISA level. 1231 Changed |= enableGLCBit(MI); 1232 break; 1233 case SIAtomicScope::WORKGROUP: 1234 // In threadgroup split mode the waves of a work-group can be executing on 1235 // different CUs. Therefore need to bypass the L1 which is per CU. 1236 // Otherwise in non-threadgroup split mode all waves of a work-group are 1237 // on the same CU, and so the L1 does not need to be bypassed. 1238 if (ST.isTgSplitEnabled()) 1239 Changed |= enableGLCBit(MI); 1240 break; 1241 case SIAtomicScope::WAVEFRONT: 1242 case SIAtomicScope::SINGLETHREAD: 1243 // No cache to bypass. 1244 break; 1245 default: 1246 llvm_unreachable("Unsupported synchronization scope"); 1247 } 1248 } 1249 1250 /// The scratch address space does not need the global memory caches 1251 /// to be bypassed as all memory operations by the same thread are 1252 /// sequentially consistent, and no other thread can access scratch 1253 /// memory. 1254 1255 /// Other address spaces do not have a cache. 
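  // Illustrative effect on GFX90A: a system- or agent-scope atomic load from
  // the global address space gets glc set above (selecting the MISS_LRU L1
  // policy), while at workgroup scope glc is only set when
  // ST.isTgSplitEnabled(), since the waves of a work-group may then execute
  // on different CUs.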
  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the nontemporal
  // attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves
    // in the work-group access the same L1, nor wait for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
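      // (When tgsplit is enabled the scope is promoted to AGENT below, so the
      // per-CU L1 is invalidated by the GFX7 path that follows.)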
1433 if (ST.isTgSplitEnabled()) { 1434 // Same as GFX7 using agent scope. 1435 Scope = SIAtomicScope::AGENT; 1436 } 1437 break; 1438 case SIAtomicScope::WAVEFRONT: 1439 case SIAtomicScope::SINGLETHREAD: 1440 // Same as GFX7. 1441 break; 1442 default: 1443 llvm_unreachable("Unsupported synchronization scope"); 1444 } 1445 } 1446 1447 /// The scratch address space does not need the global memory cache 1448 /// to be flushed as all memory operations by the same thread are 1449 /// sequentially consistent, and no other thread can access scratch 1450 /// memory. 1451 1452 /// Other address spaces do not have a cache. 1453 1454 if (Pos == Position::AFTER) 1455 --MI; 1456 1457 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1458 1459 return Changed; 1460 } 1461 1462 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1463 SIAtomicScope Scope, 1464 SIAtomicAddrSpace AddrSpace, 1465 bool IsCrossAddrSpaceOrdering, 1466 Position Pos) const { 1467 bool Changed = false; 1468 1469 MachineBasicBlock &MBB = *MI->getParent(); 1470 const DebugLoc &DL = MI->getDebugLoc(); 1471 1472 if (Pos == Position::AFTER) 1473 ++MI; 1474 1475 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1476 switch (Scope) { 1477 case SIAtomicScope::SYSTEM: 1478 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1479 // hardware does not reorder memory operations by the same wave with 1480 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1481 // to initiate writeback of any dirty cache lines of earlier writes by the 1482 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1483 // writeback has completed. 1484 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1485 // Set SC bits to indicate system scope. 1486 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1487 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1488 // vmcnt(0)" needed by the "BUFFER_WBL2". 1489 Changed = true; 1490 break; 1491 case SIAtomicScope::AGENT: 1492 case SIAtomicScope::WORKGROUP: 1493 case SIAtomicScope::WAVEFRONT: 1494 case SIAtomicScope::SINGLETHREAD: 1495 // Same as GFX7. 1496 break; 1497 default: 1498 llvm_unreachable("Unsupported synchronization scope"); 1499 } 1500 } 1501 1502 if (Pos == Position::AFTER) 1503 --MI; 1504 1505 Changed |= 1506 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1507 IsCrossAddrSpaceOrdering, Pos); 1508 1509 return Changed; 1510 } 1511 1512 bool SIGfx940CacheControl::enableLoadCacheBypass( 1513 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1514 SIAtomicAddrSpace AddrSpace) const { 1515 assert(MI->mayLoad() && !MI->mayStore()); 1516 bool Changed = false; 1517 1518 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1519 switch (Scope) { 1520 case SIAtomicScope::SYSTEM: 1521 // Set SC bits to indicate system scope. 1522 Changed |= enableSC0Bit(MI); 1523 Changed |= enableSC1Bit(MI); 1524 break; 1525 case SIAtomicScope::AGENT: 1526 // Set SC bits to indicate agent scope. 1527 Changed |= enableSC1Bit(MI); 1528 break; 1529 case SIAtomicScope::WORKGROUP: 1530 // In threadgroup split mode the waves of a work-group can be executing on 1531 // different CUs. Therefore need to bypass the L1 which is per CU. 1532 // Otherwise in non-threadgroup split mode all waves of a work-group are 1533 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1534 // bits to indicate work-group scope will do this automatically. 
1535 Changed |= enableSC0Bit(MI); 1536 break; 1537 case SIAtomicScope::WAVEFRONT: 1538 case SIAtomicScope::SINGLETHREAD: 1539 // Leave SC bits unset to indicate wavefront scope. 1540 break; 1541 default: 1542 llvm_unreachable("Unsupported synchronization scope"); 1543 } 1544 } 1545 1546 /// The scratch address space does not need the global memory caches 1547 /// to be bypassed as all memory operations by the same thread are 1548 /// sequentially consistent, and no other thread can access scratch 1549 /// memory. 1550 1551 /// Other address spaces do not have a cache. 1552 1553 return Changed; 1554 } 1555 1556 bool SIGfx940CacheControl::enableStoreCacheBypass( 1557 const MachineBasicBlock::iterator &MI, 1558 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1559 assert(!MI->mayLoad() && MI->mayStore()); 1560 bool Changed = false; 1561 1562 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1563 switch (Scope) { 1564 case SIAtomicScope::SYSTEM: 1565 // Set SC bits to indicate system scope. 1566 Changed |= enableSC0Bit(MI); 1567 Changed |= enableSC1Bit(MI); 1568 break; 1569 case SIAtomicScope::AGENT: 1570 // Set SC bits to indicate agent scope. 1571 Changed |= enableSC1Bit(MI); 1572 break; 1573 case SIAtomicScope::WORKGROUP: 1574 // Set SC bits to indicate workgroup scope. 1575 Changed |= enableSC0Bit(MI); 1576 break; 1577 case SIAtomicScope::WAVEFRONT: 1578 case SIAtomicScope::SINGLETHREAD: 1579 // Leave SC bits unset to indicate wavefront scope. 1580 break; 1581 default: 1582 llvm_unreachable("Unsupported synchronization scope"); 1583 } 1584 } 1585 1586 /// The scratch address space does not need the global memory caches 1587 /// to be bypassed as all memory operations by the same thread are 1588 /// sequentially consistent, and no other thread can access scratch 1589 /// memory. 1590 1591 /// Other address spaces do not have a cache. 1592 1593 return Changed; 1594 } 1595 1596 bool SIGfx940CacheControl::enableRMWCacheBypass( 1597 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1598 SIAtomicAddrSpace AddrSpace) const { 1599 assert(MI->mayLoad() && MI->mayStore()); 1600 bool Changed = false; 1601 1602 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1603 switch (Scope) { 1604 case SIAtomicScope::SYSTEM: 1605 // Set SC1 bit to indicate system scope. 1606 Changed |= enableSC1Bit(MI); 1607 break; 1608 case SIAtomicScope::AGENT: 1609 case SIAtomicScope::WORKGROUP: 1610 case SIAtomicScope::WAVEFRONT: 1611 case SIAtomicScope::SINGLETHREAD: 1612 // RMW atomic operations implicitly bypass the L1 cache and only use SC1 1613 // to indicate system or agent scope. The SC0 bit is used to indicate if 1614 // they are return or no-return. Leave SC1 bit unset to indicate agent 1615 // scope. 1616 break; 1617 default: 1618 llvm_unreachable("Unsupported synchronization scope"); 1619 } 1620 } 1621 1622 return Changed; 1623 } 1624 1625 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( 1626 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1627 bool IsVolatile, bool IsNonTemporal) const { 1628 // Only handle load and store, not atomic read-modify-write insructions. The 1629 // latter use glc to indicate if the atomic returns a result and so must not 1630 // be used for cache control. 1631 assert(MI->mayLoad() ^ MI->mayStore()); 1632 1633 // Only update load and store, not LLVM IR atomic read-modify-write 1634 // instructions. 
  // The latter are always marked as volatile, so handling the volatile flag
  // for them here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    Changed |= enableNTBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later reads will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, so there is no point
        // generating it when we know we are not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding
        // buffer invalidate. The invalidate is guaranteed to remove any cache
        // lines of earlier writes and ensures later reads will refetch the
        // cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would
      // writeback, and would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as any
  // other S_WAITCNT needed.
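  // For example (illustrative sketch; the wait itself is emitted by the
  // insertWait call below), a system-scope release on this target is expected
  // to expand to roughly:
  //   buffer_wbl2 sc0 sc1
  //   s_waitcnt vmcnt(0)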
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling the
  // volatile flag for them here would pessimize all atomics; they also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
1896 // For stores setting both GLC and SLC configures L0 and L1 cache policy 1897 // to MISS_EVICT and the L2 cache policy to STREAM. 1898 if (Op == SIMemOp::STORE) 1899 Changed |= enableGLCBit(MI); 1900 Changed |= enableSLCBit(MI); 1901 1902 return Changed; 1903 } 1904 1905 return Changed; 1906 } 1907 1908 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1909 SIAtomicScope Scope, 1910 SIAtomicAddrSpace AddrSpace, 1911 SIMemOp Op, 1912 bool IsCrossAddrSpaceOrdering, 1913 Position Pos) const { 1914 bool Changed = false; 1915 1916 MachineBasicBlock &MBB = *MI->getParent(); 1917 DebugLoc DL = MI->getDebugLoc(); 1918 1919 if (Pos == Position::AFTER) 1920 ++MI; 1921 1922 bool VMCnt = false; 1923 bool VSCnt = false; 1924 bool LGKMCnt = false; 1925 1926 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1927 SIAtomicAddrSpace::NONE) { 1928 switch (Scope) { 1929 case SIAtomicScope::SYSTEM: 1930 case SIAtomicScope::AGENT: 1931 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1932 VMCnt |= true; 1933 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1934 VSCnt |= true; 1935 break; 1936 case SIAtomicScope::WORKGROUP: 1937 // In WGP mode the waves of a work-group can be executing on either CU of 1938 // the WGP. Therefore need to wait for operations to complete to ensure 1939 // they are visible to waves in the other CU as the L0 is per CU. 1940 // Otherwise in CU mode and all waves of a work-group are on the same CU 1941 // which shares the same L0. 1942 if (!ST.isCuModeEnabled()) { 1943 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1944 VMCnt |= true; 1945 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1946 VSCnt |= true; 1947 } 1948 break; 1949 case SIAtomicScope::WAVEFRONT: 1950 case SIAtomicScope::SINGLETHREAD: 1951 // The L0 cache keeps all memory operations in order for 1952 // work-items in the same wavefront. 1953 break; 1954 default: 1955 llvm_unreachable("Unsupported synchronization scope"); 1956 } 1957 } 1958 1959 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1960 switch (Scope) { 1961 case SIAtomicScope::SYSTEM: 1962 case SIAtomicScope::AGENT: 1963 case SIAtomicScope::WORKGROUP: 1964 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1965 // not needed as LDS operations for all waves are executed in a total 1966 // global ordering as observed by all waves. Required if also 1967 // synchronizing with global/GDS memory as LDS operations could be 1968 // reordered with respect to later global/GDS memory operations of the 1969 // same wave. 1970 LGKMCnt |= IsCrossAddrSpaceOrdering; 1971 break; 1972 case SIAtomicScope::WAVEFRONT: 1973 case SIAtomicScope::SINGLETHREAD: 1974 // The LDS keeps all memory operations in order for 1975 // the same wavefront. 1976 break; 1977 default: 1978 llvm_unreachable("Unsupported synchronization scope"); 1979 } 1980 } 1981 1982 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1983 switch (Scope) { 1984 case SIAtomicScope::SYSTEM: 1985 case SIAtomicScope::AGENT: 1986 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1987 // is not needed as GDS operations for all waves are executed in a total 1988 // global ordering as observed by all waves. Required if also 1989 // synchronizing with global/LDS memory as GDS operations could be 1990 // reordered with respect to later global/LDS memory operations of the 1991 // same wave. 
1992 LGKMCnt |= IsCrossAddrSpaceOrdering; 1993 break; 1994 case SIAtomicScope::WORKGROUP: 1995 case SIAtomicScope::WAVEFRONT: 1996 case SIAtomicScope::SINGLETHREAD: 1997 // The GDS keeps all memory operations in order for 1998 // the same work-group. 1999 break; 2000 default: 2001 llvm_unreachable("Unsupported synchronization scope"); 2002 } 2003 } 2004 2005 if (VMCnt || LGKMCnt) { 2006 unsigned WaitCntImmediate = 2007 AMDGPU::encodeWaitcnt(IV, 2008 VMCnt ? 0 : getVmcntBitMask(IV), 2009 getExpcntBitMask(IV), 2010 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 2011 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 2012 .addImm(WaitCntImmediate); 2013 Changed = true; 2014 } 2015 2016 if (VSCnt) { 2017 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) 2018 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 2019 .addImm(0); 2020 Changed = true; 2021 } 2022 2023 if (Pos == Position::AFTER) 2024 --MI; 2025 2026 return Changed; 2027 } 2028 2029 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2030 SIAtomicScope Scope, 2031 SIAtomicAddrSpace AddrSpace, 2032 Position Pos) const { 2033 if (!InsertCacheInv) 2034 return false; 2035 2036 bool Changed = false; 2037 2038 MachineBasicBlock &MBB = *MI->getParent(); 2039 DebugLoc DL = MI->getDebugLoc(); 2040 2041 if (Pos == Position::AFTER) 2042 ++MI; 2043 2044 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2045 switch (Scope) { 2046 case SIAtomicScope::SYSTEM: 2047 case SIAtomicScope::AGENT: 2048 // The order of invalidates matter here. We must invalidate "outer in" 2049 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is 2050 // invalidated. 2051 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 2052 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2053 Changed = true; 2054 break; 2055 case SIAtomicScope::WORKGROUP: 2056 // In WGP mode the waves of a work-group can be executing on either CU of 2057 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 2058 // in CU mode and all waves of a work-group are on the same CU, and so the 2059 // L0 does not need to be invalidated. 2060 if (!ST.isCuModeEnabled()) { 2061 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2062 Changed = true; 2063 } 2064 break; 2065 case SIAtomicScope::WAVEFRONT: 2066 case SIAtomicScope::SINGLETHREAD: 2067 // No cache to invalidate. 2068 break; 2069 default: 2070 llvm_unreachable("Unsupported synchronization scope"); 2071 } 2072 } 2073 2074 /// The scratch address space does not need the global memory cache 2075 /// to be flushed as all memory operations by the same thread are 2076 /// sequentially consistent, and no other thread can access scratch 2077 /// memory. 2078 2079 /// Other address spaces do not have a cache. 2080 2081 if (Pos == Position::AFTER) 2082 --MI; 2083 2084 return Changed; 2085 } 2086 2087 bool SIGfx11CacheControl::enableLoadCacheBypass( 2088 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 2089 SIAtomicAddrSpace AddrSpace) const { 2090 assert(MI->mayLoad() && !MI->mayStore()); 2091 bool Changed = false; 2092 2093 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2094 switch (Scope) { 2095 case SIAtomicScope::SYSTEM: 2096 case SIAtomicScope::AGENT: 2097 // Set the L0 and L1 cache policies to MISS_EVICT. 2098 // Note: there is no L2 cache coherent bypass control at the ISA level. 
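      // Illustrative sketch only (assuming GFX11 asm syntax): the resulting
      // agent- or system-scope load carries the glc modifier, e.g.
      //   global_load_b32 v1, v[2:3], off glc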
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling the
  // volatile flag for them here would pessimize all atomics; they also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
    Changed |= enableDLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
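    // Illustrative sketch only (assuming GFX11 asm syntax): a nontemporal
    // global store therefore ends up as something like
    //   global_store_b32 v[0:1], v2, off glc slc dlc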
2175 Changed |= enableDLCBit(MI); 2176 return Changed; 2177 } 2178 2179 return Changed; 2180 } 2181 2182 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, 2183 AMDGPU::CPol::CPol Value) const { 2184 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2185 if (!CPol) 2186 return false; 2187 2188 uint64_t NewTH = Value & AMDGPU::CPol::TH; 2189 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { 2190 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); 2191 return true; 2192 } 2193 2194 return false; 2195 } 2196 2197 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, 2198 AMDGPU::CPol::CPol Value) const { 2199 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2200 if (!CPol) 2201 return false; 2202 2203 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; 2204 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { 2205 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); 2206 return true; 2207 } 2208 2209 return false; 2210 } 2211 2212 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( 2213 const MachineBasicBlock::iterator MI) const { 2214 // TODO: implement flag for frontend to give us a hint not to insert waits. 2215 2216 MachineBasicBlock &MBB = *MI->getParent(); 2217 const DebugLoc &DL = MI->getDebugLoc(); 2218 2219 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); 2220 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); 2221 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); 2222 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); 2223 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); 2224 2225 return true; 2226 } 2227 2228 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, 2229 SIAtomicScope Scope, 2230 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2231 bool IsCrossAddrSpaceOrdering, 2232 Position Pos) const { 2233 bool Changed = false; 2234 2235 MachineBasicBlock &MBB = *MI->getParent(); 2236 DebugLoc DL = MI->getDebugLoc(); 2237 2238 bool LOADCnt = false; 2239 bool DSCnt = false; 2240 bool STORECnt = false; 2241 2242 if (Pos == Position::AFTER) 2243 ++MI; 2244 2245 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 2246 SIAtomicAddrSpace::NONE) { 2247 switch (Scope) { 2248 case SIAtomicScope::SYSTEM: 2249 case SIAtomicScope::AGENT: 2250 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2251 LOADCnt |= true; 2252 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2253 STORECnt |= true; 2254 break; 2255 case SIAtomicScope::WORKGROUP: 2256 // In WGP mode the waves of a work-group can be executing on either CU of 2257 // the WGP. Therefore need to wait for operations to complete to ensure 2258 // they are visible to waves in the other CU as the L0 is per CU. 2259 // Otherwise in CU mode and all waves of a work-group are on the same CU 2260 // which shares the same L0. 2261 if (!ST.isCuModeEnabled()) { 2262 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2263 LOADCnt |= true; 2264 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2265 STORECnt |= true; 2266 } 2267 break; 2268 case SIAtomicScope::WAVEFRONT: 2269 case SIAtomicScope::SINGLETHREAD: 2270 // The L0 cache keeps all memory operations in order for 2271 // work-items in the same wavefront. 
2272 break; 2273 default: 2274 llvm_unreachable("Unsupported synchronization scope"); 2275 } 2276 } 2277 2278 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 2279 switch (Scope) { 2280 case SIAtomicScope::SYSTEM: 2281 case SIAtomicScope::AGENT: 2282 case SIAtomicScope::WORKGROUP: 2283 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2284 // not needed as LDS operations for all waves are executed in a total 2285 // global ordering as observed by all waves. Required if also 2286 // synchronizing with global/GDS memory as LDS operations could be 2287 // reordered with respect to later global/GDS memory operations of the 2288 // same wave. 2289 DSCnt |= IsCrossAddrSpaceOrdering; 2290 break; 2291 case SIAtomicScope::WAVEFRONT: 2292 case SIAtomicScope::SINGLETHREAD: 2293 // The LDS keeps all memory operations in order for 2294 // the same wavefront. 2295 break; 2296 default: 2297 llvm_unreachable("Unsupported synchronization scope"); 2298 } 2299 } 2300 2301 if (LOADCnt) { 2302 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); 2303 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); 2304 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); 2305 Changed = true; 2306 } 2307 2308 if (STORECnt) { 2309 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); 2310 Changed = true; 2311 } 2312 2313 if (DSCnt) { 2314 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); 2315 Changed = true; 2316 } 2317 2318 if (Pos == Position::AFTER) 2319 --MI; 2320 2321 return Changed; 2322 } 2323 2324 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2325 SIAtomicScope Scope, 2326 SIAtomicAddrSpace AddrSpace, 2327 Position Pos) const { 2328 if (!InsertCacheInv) 2329 return false; 2330 2331 MachineBasicBlock &MBB = *MI->getParent(); 2332 DebugLoc DL = MI->getDebugLoc(); 2333 2334 /// The scratch address space does not need the global memory cache 2335 /// to be flushed as all memory operations by the same thread are 2336 /// sequentially consistent, and no other thread can access scratch 2337 /// memory. 2338 2339 /// Other address spaces do not have a cache. 2340 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2341 return false; 2342 2343 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2344 switch (Scope) { 2345 case SIAtomicScope::SYSTEM: 2346 ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2347 break; 2348 case SIAtomicScope::AGENT: 2349 ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2350 break; 2351 case SIAtomicScope::WORKGROUP: 2352 // In WGP mode the waves of a work-group can be executing on either CU of 2353 // the WGP. Therefore we need to invalidate the L0 which is per CU. 2354 // Otherwise in CU mode all waves of a work-group are on the same CU, and so 2355 // the L0 does not need to be invalidated. 2356 if (ST.isCuModeEnabled()) 2357 return false; 2358 2359 ScopeImm = AMDGPU::CPol::SCOPE_SE; 2360 break; 2361 case SIAtomicScope::WAVEFRONT: 2362 case SIAtomicScope::SINGLETHREAD: 2363 // No cache to invalidate. 
2364 return false; 2365 default: 2366 llvm_unreachable("Unsupported synchronization scope"); 2367 } 2368 2369 if (Pos == Position::AFTER) 2370 ++MI; 2371 2372 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); 2373 2374 if (Pos == Position::AFTER) 2375 --MI; 2376 2377 return true; 2378 } 2379 2380 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( 2381 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2382 bool IsVolatile, bool IsNonTemporal) const { 2383 2384 // Only handle load and store, not atomic read-modify-write instructions. 2385 assert(MI->mayLoad() ^ MI->mayStore()); 2386 2387 // Only update load and store, not LLVM IR atomic read-modify-write 2388 // instructions. The latter are always marked as volatile so cannot sensibly 2389 // handle it as do not want to pessimize all atomics. Also they do not support 2390 // the nontemporal attribute. 2391 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 2392 2393 bool Changed = false; 2394 2395 if (IsNonTemporal) { 2396 // Set non-temporal hint for all cache levels. 2397 Changed |= setTH(MI, AMDGPU::CPol::TH_NT); 2398 } 2399 2400 if (IsVolatile) { 2401 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); 2402 2403 if (Op == SIMemOp::STORE) 2404 Changed |= insertWaitsBeforeSystemScopeStore(MI); 2405 2406 // Ensure operation has completed at system scope to cause all volatile 2407 // operations to be visible outside the program in a global order. Do not 2408 // request cross address space as only the global address space can be 2409 // observable outside the program, so no need to cause a waitcnt for LDS 2410 // address space operations. 2411 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 2412 Position::AFTER); 2413 } 2414 2415 return Changed; 2416 } 2417 2418 bool SIGfx12CacheControl::expandSystemScopeStore( 2419 MachineBasicBlock::iterator &MI) const { 2420 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2421 if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) 2422 return insertWaitsBeforeSystemScopeStore(MI); 2423 2424 return false; 2425 } 2426 2427 bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 2428 if (AtomicPseudoMIs.empty()) 2429 return false; 2430 2431 for (auto &MI : AtomicPseudoMIs) 2432 MI->eraseFromParent(); 2433 2434 AtomicPseudoMIs.clear(); 2435 return true; 2436 } 2437 2438 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 2439 MachineBasicBlock::iterator &MI) { 2440 assert(MI->mayLoad() && !MI->mayStore()); 2441 2442 bool Changed = false; 2443 2444 if (MOI.isAtomic()) { 2445 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2446 MOI.getOrdering() == AtomicOrdering::Acquire || 2447 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2448 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 2449 MOI.getOrderingAddrSpace()); 2450 } 2451 2452 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2453 Changed |= CC->insertWait(MI, MOI.getScope(), 2454 MOI.getOrderingAddrSpace(), 2455 SIMemOp::LOAD | SIMemOp::STORE, 2456 MOI.getIsCrossAddressSpaceOrdering(), 2457 Position::BEFORE); 2458 2459 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2460 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2461 Changed |= CC->insertWait(MI, MOI.getScope(), 2462 MOI.getInstrAddrSpace(), 2463 SIMemOp::LOAD, 2464 MOI.getIsCrossAddressSpaceOrdering(), 2465 Position::AFTER); 2466 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2467 MOI.getOrderingAddrSpace(), 2468 Position::AFTER); 2469 } 
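    // For example (illustrative sketch, GFX10 flavour, agent-scope
    // sequentially consistent load), the expansion above yields roughly:
    //   s_waitcnt vmcnt(0)             ; order against earlier loads
    //   s_waitcnt_vscnt null, 0x0      ; order against earlier stores
    //   global_load_dword ... glc dlc  ; the load itself, bypassing L0/L1
    //   s_waitcnt vmcnt(0)             ; acquire: wait for the load
    //   buffer_gl1_inv
    //   buffer_gl0_inv                 ; acquire: invalidate local caches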
    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());

  // GFX12 specific: the scope (the desired coherence domain in the cache
  // hierarchy) is an instruction field here; do not confuse it with the
  // atomic scope.
  Changed |= CC->expandSystemScopeStore(MI);
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire)
      Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.
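    // For example (illustrative sketch, GFX10 flavour, agent-scope acq_rel
    // fence), the release inserted above plus the acquire inserted below give
    // roughly:
    //   s_waitcnt vmcnt(0) lgkmcnt(0)
    //   s_waitcnt_vscnt null, 0x0
    //   buffer_gl1_inv
    //   buffer_gl0_inv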
2553 2554 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2555 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2556 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2557 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2558 MOI.getOrderingAddrSpace(), 2559 Position::BEFORE); 2560 2561 return Changed; 2562 } 2563 2564 return Changed; 2565 } 2566 2567 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 2568 MachineBasicBlock::iterator &MI) { 2569 assert(MI->mayLoad() && MI->mayStore()); 2570 2571 bool Changed = false; 2572 2573 if (MOI.isAtomic()) { 2574 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2575 MOI.getOrdering() == AtomicOrdering::Acquire || 2576 MOI.getOrdering() == AtomicOrdering::Release || 2577 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2578 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2579 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2580 MOI.getInstrAddrSpace()); 2581 } 2582 2583 if (MOI.getOrdering() == AtomicOrdering::Release || 2584 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2585 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2586 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2587 Changed |= CC->insertRelease(MI, MOI.getScope(), 2588 MOI.getOrderingAddrSpace(), 2589 MOI.getIsCrossAddressSpaceOrdering(), 2590 Position::BEFORE); 2591 2592 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2593 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2594 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2595 MOI.getFailureOrdering() == AtomicOrdering::Acquire || 2596 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 2597 Changed |= CC->insertWait(MI, MOI.getScope(), 2598 MOI.getInstrAddrSpace(), 2599 isAtomicRet(*MI) ? SIMemOp::LOAD : 2600 SIMemOp::STORE, 2601 MOI.getIsCrossAddressSpaceOrdering(), 2602 Position::AFTER); 2603 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2604 MOI.getOrderingAddrSpace(), 2605 Position::AFTER); 2606 } 2607 2608 return Changed; 2609 } 2610 2611 return Changed; 2612 } 2613 2614 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 2615 bool Changed = false; 2616 2617 SIMemOpAccess MOA(MF); 2618 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 2619 2620 for (auto &MBB : MF) { 2621 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 2622 2623 // Unbundle instructions after the post-RA scheduler. 
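      // (Assumption about intent: the legalizer may need to insert waits and
      // cache-control instructions between the bundled memory operations, so
      // the bundle is broken up before those operations are processed.)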
2624 if (MI->isBundle() && MI->mayLoadOrStore()) { 2625 MachineBasicBlock::instr_iterator II(MI->getIterator()); 2626 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 2627 I != E && I->isBundledWithPred(); ++I) { 2628 I->unbundleFromPred(); 2629 for (MachineOperand &MO : I->operands()) 2630 if (MO.isReg()) 2631 MO.setIsInternalRead(false); 2632 } 2633 2634 MI->eraseFromParent(); 2635 MI = II->getIterator(); 2636 } 2637 2638 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 2639 continue; 2640 2641 if (const auto &MOI = MOA.getLoadInfo(MI)) 2642 Changed |= expandLoad(*MOI, MI); 2643 else if (const auto &MOI = MOA.getStoreInfo(MI)) { 2644 Changed |= expandStore(*MOI, MI); 2645 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); 2646 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2647 Changed |= expandAtomicFence(*MOI, MI); 2648 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2649 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); 2650 } 2651 } 2652 2653 Changed |= removeAtomicPseudoMIs(); 2654 return Changed; 2655 } 2656 2657 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 2658 2659 char SIMemoryLegalizer::ID = 0; 2660 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 2661 2662 FunctionPass *llvm::createSIMemoryLegalizerPass() { 2663 return new SIMemoryLegalizer(); 2664 } 2665