//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
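
// Illustrative sketch (not part of the pass logic): because the enums above
// are marked as bitmask enums, address-space sets combine and test with the
// usual bitwise operators, e.g.
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   if ((AS & SIAtomicAddrSpace::FLAT) != SIAtomicAddrSpace::NONE) {
//     // Touches at least one address space reachable by FLAT instructions.
//   }
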
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \return A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or
  /// nontemporal/last-use. Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal,
                                              bool IsLastUse = false) const = 0;

  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  }

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

  virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) const {
    return false;
  }
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool
  enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                         SIAtomicScope Scope,
                         SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                           MachineBasicBlock::iterator &MI) const override {
    bool Changed = false;
    if (ST.hasForceStoreSC0SC1() &&
        (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
                                    SIAtomicAddrSpace::GLOBAL |
                                    SIAtomicAddrSpace::OTHER)) !=
            SIAtomicAddrSpace::NONE) {
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
    }
    return Changed;
  }
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};

void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
  const MachineFunction *MF = MI.getMF();
  const Function &Fn = MF->getFunction();
  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "unknown address space '" << AS << "'; expected one of ";
  ListSeparator LS;
  for (const auto &[Name, Val] : ASNames)
    OS << LS << '\'' << Name << '\'';
  DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
  Fn.getContext().diagnose(BadTag);
}

/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
/// If this tag isn't present, or if it has no meaningful values, returns \p
/// Default. Otherwise returns all the address spaces concerned by the MMRA.
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
                                               SIAtomicAddrSpace Default) {
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";

  auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
  if (!MMRA)
    return Default;

  SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
  for (const auto &[Prefix, Suffix] : MMRA) {
    if (Prefix != FenceASPrefix)
      continue;

    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
  }

  return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
}
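
// Illustrative example of the input this parses (an assumption about typical
// IR, not something defined in this file): an LLVM IR fence carrying an
// "amdgpu-as" MMRA such as
//
//   fence syncscope("workgroup") release, !mmra !0
//   !0 = !{!"amdgpu-as", !"local"}
//
// would be narrowed by getFenceAddrSpaceMMRA to SIAtomicAddrSpace::LDS, so the
// legalizer only needs to order LDS accesses for that fence.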

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}
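
// Note on the dispatch above: gfx90a and gfx940 are identified by subtarget
// feature rather than by generation, so those checks deliberately come before
// the generation comparisons. A sketch of the presumed use from the pass
// itself (runOnMachineFunction is declared above and defined later in the
// file; this call shape is an assumption, not quoted from it):
//
//   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
//
// after which every load/store/fence/atomic expansion is routed through the
// returned SICacheControl implementation.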

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
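
// For illustration (not emitted verbatim anywhere in this file): an agent-scope
// acquire covering GLOBAL | LDS with cross-address-space ordering sets both
// VMCnt and LGKMCnt above, so the S_WAITCNT_soft built there encodes vmcnt(0)
// and lgkmcnt(0) while leaving expcnt at its maximum (i.e. not waited on).
// After the later waitcnt passes it would print roughly as
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)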

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
1561 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1562 // Set SC bits to indicate system scope. 1563 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1564 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1565 // vmcnt(0)" needed by the "BUFFER_WBL2". 1566 Changed = true; 1567 break; 1568 case SIAtomicScope::AGENT: 1569 case SIAtomicScope::WORKGROUP: 1570 case SIAtomicScope::WAVEFRONT: 1571 case SIAtomicScope::SINGLETHREAD: 1572 // Same as GFX7. 1573 break; 1574 default: 1575 llvm_unreachable("Unsupported synchronization scope"); 1576 } 1577 } 1578 1579 if (Pos == Position::AFTER) 1580 --MI; 1581 1582 Changed |= 1583 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1584 IsCrossAddrSpaceOrdering, Pos); 1585 1586 return Changed; 1587 } 1588 1589 bool SIGfx940CacheControl::enableLoadCacheBypass( 1590 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1591 SIAtomicAddrSpace AddrSpace) const { 1592 assert(MI->mayLoad() && !MI->mayStore()); 1593 bool Changed = false; 1594 1595 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1596 switch (Scope) { 1597 case SIAtomicScope::SYSTEM: 1598 // Set SC bits to indicate system scope. 1599 Changed |= enableSC0Bit(MI); 1600 Changed |= enableSC1Bit(MI); 1601 break; 1602 case SIAtomicScope::AGENT: 1603 // Set SC bits to indicate agent scope. 1604 Changed |= enableSC1Bit(MI); 1605 break; 1606 case SIAtomicScope::WORKGROUP: 1607 // In threadgroup split mode the waves of a work-group can be executing on 1608 // different CUs. Therefore need to bypass the L1 which is per CU. 1609 // Otherwise in non-threadgroup split mode all waves of a work-group are 1610 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1611 // bits to indicate work-group scope will do this automatically. 1612 Changed |= enableSC0Bit(MI); 1613 break; 1614 case SIAtomicScope::WAVEFRONT: 1615 case SIAtomicScope::SINGLETHREAD: 1616 // Leave SC bits unset to indicate wavefront scope. 1617 break; 1618 default: 1619 llvm_unreachable("Unsupported synchronization scope"); 1620 } 1621 } 1622 1623 /// The scratch address space does not need the global memory caches 1624 /// to be bypassed as all memory operations by the same thread are 1625 /// sequentially consistent, and no other thread can access scratch 1626 /// memory. 1627 1628 /// Other address spaces do not have a cache. 1629 1630 return Changed; 1631 } 1632 1633 bool SIGfx940CacheControl::enableStoreCacheBypass( 1634 const MachineBasicBlock::iterator &MI, 1635 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1636 assert(!MI->mayLoad() && MI->mayStore()); 1637 bool Changed = false; 1638 1639 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1640 switch (Scope) { 1641 case SIAtomicScope::SYSTEM: 1642 // Set SC bits to indicate system scope. 1643 Changed |= enableSC0Bit(MI); 1644 Changed |= enableSC1Bit(MI); 1645 break; 1646 case SIAtomicScope::AGENT: 1647 // Set SC bits to indicate agent scope. 1648 Changed |= enableSC1Bit(MI); 1649 break; 1650 case SIAtomicScope::WORKGROUP: 1651 // Set SC bits to indicate workgroup scope. 1652 Changed |= enableSC0Bit(MI); 1653 break; 1654 case SIAtomicScope::WAVEFRONT: 1655 case SIAtomicScope::SINGLETHREAD: 1656 // Leave SC bits unset to indicate wavefront scope. 
1657       break;
1658     default:
1659       llvm_unreachable("Unsupported synchronization scope");
1660     }
1661   }
1662
1663   /// The scratch address space does not need the global memory caches
1664   /// to be bypassed as all memory operations by the same thread are
1665   /// sequentially consistent, and no other thread can access scratch
1666   /// memory.
1667
1668   /// Other address spaces do not have a cache.
1669
1670   return Changed;
1671 }
1672
1673 bool SIGfx940CacheControl::enableRMWCacheBypass(
1674     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1675     SIAtomicAddrSpace AddrSpace) const {
1676   assert(MI->mayLoad() && MI->mayStore());
1677   bool Changed = false;
1678
1679   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1680     switch (Scope) {
1681     case SIAtomicScope::SYSTEM:
1682       // Set SC1 bit to indicate system scope.
1683       Changed |= enableSC1Bit(MI);
1684       break;
1685     case SIAtomicScope::AGENT:
1686     case SIAtomicScope::WORKGROUP:
1687     case SIAtomicScope::WAVEFRONT:
1688     case SIAtomicScope::SINGLETHREAD:
1689       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1690       // to indicate system or agent scope. The SC0 bit is used to indicate if
1691       // they are return or no-return. Leave SC1 bit unset to indicate agent
1692       // scope.
1693       break;
1694     default:
1695       llvm_unreachable("Unsupported synchronization scope");
1696     }
1697   }
1698
1699   return Changed;
1700 }
1701
1702 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1703     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1704     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1705   // Only handle load and store, not atomic read-modify-write instructions. The
1706   // latter use glc to indicate if the atomic returns a result and so must not
1707   // be used for cache control.
1708   assert(MI->mayLoad() ^ MI->mayStore());
1709
1710   // Only update load and store, not LLVM IR atomic read-modify-write
1711   // instructions. The latter are always marked as volatile, so they cannot
1712   // sensibly be handled here without pessimizing all atomics. They also do not
1713   // support the nontemporal attribute.
1714   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1715
1716   bool Changed = false;
1717
1718   if (IsVolatile) {
1719     // Set SC bits to indicate system scope.
1720     Changed |= enableSC0Bit(MI);
1721     Changed |= enableSC1Bit(MI);
1722
1723     // Ensure operation has completed at system scope to cause all volatile
1724     // operations to be visible outside the program in a global order. Do not
1725     // request cross address space as only the global address space can be
1726     // observable outside the program, so no need to cause a waitcnt for LDS
1727     // address space operations.
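    // E.g. a volatile global load is expected to end up roughly as a load with
    // both SC bits set followed by an "s_waitcnt vmcnt(0)" (illustrative only;
    // the exact assembly depends on the instruction and operands).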
1728     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1729                           Position::AFTER);
1730
1731     return Changed;
1732   }
1733
1734   if (IsNonTemporal) {
1735     Changed |= enableNTBit(MI);
1736     return Changed;
1737   }
1738
1739   return Changed;
1740 }
1741
1742 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1743                                          SIAtomicScope Scope,
1744                                          SIAtomicAddrSpace AddrSpace,
1745                                          Position Pos) const {
1746   if (!InsertCacheInv)
1747     return false;
1748
1749   bool Changed = false;
1750
1751   MachineBasicBlock &MBB = *MI->getParent();
1752   DebugLoc DL = MI->getDebugLoc();
1753
1754   if (Pos == Position::AFTER)
1755     ++MI;
1756
1757   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1758     switch (Scope) {
1759     case SIAtomicScope::SYSTEM:
1760       // Ensures that following loads will not see stale remote VMEM data or
1761       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1762       // CC will never be stale due to the local memory probes.
1763       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1764           // Set SC bits to indicate system scope.
1765           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1766       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1767       // hardware does not reorder memory operations by the same wave with
1768       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1769       // remove any cache lines of earlier writes by the same wave and ensures
1770       // later reads by the same wave will refetch the cache lines.
1771       Changed = true;
1772       break;
1773     case SIAtomicScope::AGENT:
1774       // Ensures that following loads will not see stale remote data or local
1775       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1776       // due to the memory probes.
1777       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1778           // Set SC bits to indicate agent scope.
1779           .addImm(AMDGPU::CPol::SC1);
1780       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1781       // does not reorder memory operations with respect to a preceding buffer
1782       // invalidate. The invalidate is guaranteed to remove any cache lines of
1783       // earlier writes and ensures later reads will refetch the cache lines.
1784       Changed = true;
1785       break;
1786     case SIAtomicScope::WORKGROUP:
1787       // In threadgroup split mode the waves of a work-group can be executing on
1788       // different CUs. Therefore need to invalidate the L1 which is per CU.
1789       // Otherwise in non-threadgroup split mode all waves of a work-group are
1790       // on the same CU, and so the L1 does not need to be invalidated.
1791       if (ST.isTgSplitEnabled()) {
1792         // Ensures L1 is invalidated if in threadgroup split mode. In
1793         // non-threadgroup split mode it is a NOP, but there is no point
1794         // generating it when we know we are not in that mode.
1795         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1796             // Set SC bits to indicate work-group scope.
1797             .addImm(AMDGPU::CPol::SC0);
1798         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1799         // does not reorder memory operations with respect to a preceding buffer
1800         // invalidate. The invalidate is guaranteed to remove any cache lines of
1801         // earlier writes and ensures later reads will refetch the cache lines.
1802         Changed = true;
1803       }
1804       break;
1805     case SIAtomicScope::WAVEFRONT:
1806     case SIAtomicScope::SINGLETHREAD:
1807       // Could generate "BUFFER_INV" but it would do nothing as there are no
1808       // caches to invalidate.
1809 break; 1810 default: 1811 llvm_unreachable("Unsupported synchronization scope"); 1812 } 1813 } 1814 1815 /// The scratch address space does not need the global memory cache 1816 /// to be flushed as all memory operations by the same thread are 1817 /// sequentially consistent, and no other thread can access scratch 1818 /// memory. 1819 1820 /// Other address spaces do not have a cache. 1821 1822 if (Pos == Position::AFTER) 1823 --MI; 1824 1825 return Changed; 1826 } 1827 1828 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1829 SIAtomicScope Scope, 1830 SIAtomicAddrSpace AddrSpace, 1831 bool IsCrossAddrSpaceOrdering, 1832 Position Pos) const { 1833 bool Changed = false; 1834 1835 MachineBasicBlock &MBB = *MI->getParent(); 1836 DebugLoc DL = MI->getDebugLoc(); 1837 1838 if (Pos == Position::AFTER) 1839 ++MI; 1840 1841 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1842 switch (Scope) { 1843 case SIAtomicScope::SYSTEM: 1844 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1845 // hardware does not reorder memory operations by the same wave with 1846 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1847 // to initiate writeback of any dirty cache lines of earlier writes by the 1848 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1849 // writeback has completed. 1850 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1851 // Set SC bits to indicate system scope. 1852 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1853 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1854 // SIAtomicScope::SYSTEM, the following insertWait will generate the 1855 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 1856 Changed = true; 1857 break; 1858 case SIAtomicScope::AGENT: 1859 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1860 // Set SC bits to indicate agent scope. 1861 .addImm(AMDGPU::CPol::SC1); 1862 1863 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1864 // SIAtomicScope::AGENT, the following insertWait will generate the 1865 // required "S_WAITCNT vmcnt(0)". 1866 Changed = true; 1867 break; 1868 case SIAtomicScope::WORKGROUP: 1869 case SIAtomicScope::WAVEFRONT: 1870 case SIAtomicScope::SINGLETHREAD: 1871 // Do not generate "BUFFER_WBL2" as there are no caches it would 1872 // writeback, and would require an otherwise unnecessary 1873 // "S_WAITCNT vmcnt(0)". 1874 break; 1875 default: 1876 llvm_unreachable("Unsupported synchronization scope"); 1877 } 1878 } 1879 1880 if (Pos == Position::AFTER) 1881 --MI; 1882 1883 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other 1884 // S_WAITCNT needed. 1885 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1886 IsCrossAddrSpaceOrdering, Pos); 1887 1888 return Changed; 1889 } 1890 1891 bool SIGfx10CacheControl::enableLoadCacheBypass( 1892 const MachineBasicBlock::iterator &MI, 1893 SIAtomicScope Scope, 1894 SIAtomicAddrSpace AddrSpace) const { 1895 assert(MI->mayLoad() && !MI->mayStore()); 1896 bool Changed = false; 1897 1898 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1899 switch (Scope) { 1900 case SIAtomicScope::SYSTEM: 1901 case SIAtomicScope::AGENT: 1902 // Set the L0 and L1 cache policies to MISS_EVICT. 1903 // Note: there is no L2 cache coherent bypass control at the ISA level. 
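      // (On GFX10 the GLC bit controls the per-CU L0 policy and the DLC bit
      // controls the L1 policy, hence both are set here.)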
1904       Changed |= enableGLCBit(MI);
1905       Changed |= enableDLCBit(MI);
1906       break;
1907     case SIAtomicScope::WORKGROUP:
1908       // In WGP mode the waves of a work-group can be executing on either CU of
1909       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1910       // CU mode all waves of a work-group are on the same CU, and so the L0
1911       // does not need to be bypassed.
1912       if (!ST.isCuModeEnabled())
1913         Changed |= enableGLCBit(MI);
1914       break;
1915     case SIAtomicScope::WAVEFRONT:
1916     case SIAtomicScope::SINGLETHREAD:
1917       // No cache to bypass.
1918       break;
1919     default:
1920       llvm_unreachable("Unsupported synchronization scope");
1921     }
1922   }
1923
1924   /// The scratch address space does not need the global memory caches
1925   /// to be bypassed as all memory operations by the same thread are
1926   /// sequentially consistent, and no other thread can access scratch
1927   /// memory.
1928
1929   /// Other address spaces do not have a cache.
1930
1931   return Changed;
1932 }
1933
1934 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1935     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1936     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1937
1938   // Only handle load and store, not atomic read-modify-write instructions. The
1939   // latter use glc to indicate if the atomic returns a result and so must not
1940   // be used for cache control.
1941   assert(MI->mayLoad() ^ MI->mayStore());
1942
1943   // Only update load and store, not LLVM IR atomic read-modify-write
1944   // instructions. The latter are always marked as volatile, so they cannot
1945   // sensibly be handled here without pessimizing all atomics. They also do not
1946   // support the nontemporal attribute.
1947   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1948
1949   bool Changed = false;
1950
1951   if (IsVolatile) {
1952     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1953     // and MISS_LRU for store instructions.
1954     // Note: there is no L2 cache coherent bypass control at the ISA level.
1955     if (Op == SIMemOp::LOAD) {
1956       Changed |= enableGLCBit(MI);
1957       Changed |= enableDLCBit(MI);
1958     }
1959
1960     // Ensure operation has completed at system scope to cause all volatile
1961     // operations to be visible outside the program in a global order. Do not
1962     // request cross address space as only the global address space can be
1963     // observable outside the program, so no need to cause a waitcnt for LDS
1964     // address space operations.
1965     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1966                           Position::AFTER);
1967     return Changed;
1968   }
1969
1970   if (IsNonTemporal) {
1971     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1972     // and L2 cache policy to STREAM.
1973     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1974     // to MISS_EVICT and the L2 cache policy to STREAM.
1975 if (Op == SIMemOp::STORE) 1976 Changed |= enableGLCBit(MI); 1977 Changed |= enableSLCBit(MI); 1978 1979 return Changed; 1980 } 1981 1982 return Changed; 1983 } 1984 1985 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1986 SIAtomicScope Scope, 1987 SIAtomicAddrSpace AddrSpace, 1988 SIMemOp Op, 1989 bool IsCrossAddrSpaceOrdering, 1990 Position Pos) const { 1991 bool Changed = false; 1992 1993 MachineBasicBlock &MBB = *MI->getParent(); 1994 DebugLoc DL = MI->getDebugLoc(); 1995 1996 if (Pos == Position::AFTER) 1997 ++MI; 1998 1999 bool VMCnt = false; 2000 bool VSCnt = false; 2001 bool LGKMCnt = false; 2002 2003 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 2004 SIAtomicAddrSpace::NONE) { 2005 switch (Scope) { 2006 case SIAtomicScope::SYSTEM: 2007 case SIAtomicScope::AGENT: 2008 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2009 VMCnt |= true; 2010 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2011 VSCnt |= true; 2012 break; 2013 case SIAtomicScope::WORKGROUP: 2014 // In WGP mode the waves of a work-group can be executing on either CU of 2015 // the WGP. Therefore need to wait for operations to complete to ensure 2016 // they are visible to waves in the other CU as the L0 is per CU. 2017 // Otherwise in CU mode and all waves of a work-group are on the same CU 2018 // which shares the same L0. 2019 if (!ST.isCuModeEnabled()) { 2020 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2021 VMCnt |= true; 2022 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2023 VSCnt |= true; 2024 } 2025 break; 2026 case SIAtomicScope::WAVEFRONT: 2027 case SIAtomicScope::SINGLETHREAD: 2028 // The L0 cache keeps all memory operations in order for 2029 // work-items in the same wavefront. 2030 break; 2031 default: 2032 llvm_unreachable("Unsupported synchronization scope"); 2033 } 2034 } 2035 2036 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 2037 switch (Scope) { 2038 case SIAtomicScope::SYSTEM: 2039 case SIAtomicScope::AGENT: 2040 case SIAtomicScope::WORKGROUP: 2041 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2042 // not needed as LDS operations for all waves are executed in a total 2043 // global ordering as observed by all waves. Required if also 2044 // synchronizing with global/GDS memory as LDS operations could be 2045 // reordered with respect to later global/GDS memory operations of the 2046 // same wave. 2047 LGKMCnt |= IsCrossAddrSpaceOrdering; 2048 break; 2049 case SIAtomicScope::WAVEFRONT: 2050 case SIAtomicScope::SINGLETHREAD: 2051 // The LDS keeps all memory operations in order for 2052 // the same wavefront. 2053 break; 2054 default: 2055 llvm_unreachable("Unsupported synchronization scope"); 2056 } 2057 } 2058 2059 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 2060 switch (Scope) { 2061 case SIAtomicScope::SYSTEM: 2062 case SIAtomicScope::AGENT: 2063 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 2064 // is not needed as GDS operations for all waves are executed in a total 2065 // global ordering as observed by all waves. Required if also 2066 // synchronizing with global/LDS memory as GDS operations could be 2067 // reordered with respect to later global/LDS memory operations of the 2068 // same wave. 2069 LGKMCnt |= IsCrossAddrSpaceOrdering; 2070 break; 2071 case SIAtomicScope::WORKGROUP: 2072 case SIAtomicScope::WAVEFRONT: 2073 case SIAtomicScope::SINGLETHREAD: 2074 // The GDS keeps all memory operations in order for 2075 // the same work-group. 
2076 break; 2077 default: 2078 llvm_unreachable("Unsupported synchronization scope"); 2079 } 2080 } 2081 2082 if (VMCnt || LGKMCnt) { 2083 unsigned WaitCntImmediate = 2084 AMDGPU::encodeWaitcnt(IV, 2085 VMCnt ? 0 : getVmcntBitMask(IV), 2086 getExpcntBitMask(IV), 2087 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 2088 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 2089 .addImm(WaitCntImmediate); 2090 Changed = true; 2091 } 2092 2093 if (VSCnt) { 2094 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) 2095 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 2096 .addImm(0); 2097 Changed = true; 2098 } 2099 2100 if (Pos == Position::AFTER) 2101 --MI; 2102 2103 return Changed; 2104 } 2105 2106 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2107 SIAtomicScope Scope, 2108 SIAtomicAddrSpace AddrSpace, 2109 Position Pos) const { 2110 if (!InsertCacheInv) 2111 return false; 2112 2113 bool Changed = false; 2114 2115 MachineBasicBlock &MBB = *MI->getParent(); 2116 DebugLoc DL = MI->getDebugLoc(); 2117 2118 if (Pos == Position::AFTER) 2119 ++MI; 2120 2121 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2122 switch (Scope) { 2123 case SIAtomicScope::SYSTEM: 2124 case SIAtomicScope::AGENT: 2125 // The order of invalidates matter here. We must invalidate "outer in" 2126 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is 2127 // invalidated. 2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 2129 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2130 Changed = true; 2131 break; 2132 case SIAtomicScope::WORKGROUP: 2133 // In WGP mode the waves of a work-group can be executing on either CU of 2134 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 2135 // in CU mode and all waves of a work-group are on the same CU, and so the 2136 // L0 does not need to be invalidated. 2137 if (!ST.isCuModeEnabled()) { 2138 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2139 Changed = true; 2140 } 2141 break; 2142 case SIAtomicScope::WAVEFRONT: 2143 case SIAtomicScope::SINGLETHREAD: 2144 // No cache to invalidate. 2145 break; 2146 default: 2147 llvm_unreachable("Unsupported synchronization scope"); 2148 } 2149 } 2150 2151 /// The scratch address space does not need the global memory cache 2152 /// to be flushed as all memory operations by the same thread are 2153 /// sequentially consistent, and no other thread can access scratch 2154 /// memory. 2155 2156 /// Other address spaces do not have a cache. 2157 2158 if (Pos == Position::AFTER) 2159 --MI; 2160 2161 return Changed; 2162 } 2163 2164 bool SIGfx11CacheControl::enableLoadCacheBypass( 2165 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 2166 SIAtomicAddrSpace AddrSpace) const { 2167 assert(MI->mayLoad() && !MI->mayStore()); 2168 bool Changed = false; 2169 2170 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2171 switch (Scope) { 2172 case SIAtomicScope::SYSTEM: 2173 case SIAtomicScope::AGENT: 2174 // Set the L0 and L1 cache policies to MISS_EVICT. 2175 // Note: there is no L2 cache coherent bypass control at the ISA level. 2176 Changed |= enableGLCBit(MI); 2177 break; 2178 case SIAtomicScope::WORKGROUP: 2179 // In WGP mode the waves of a work-group can be executing on either CU of 2180 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 2181 // CU mode all waves of a work-group are on the same CU, and so the L0 2182 // does not need to be bypassed. 
2183       if (!ST.isCuModeEnabled())
2184         Changed |= enableGLCBit(MI);
2185       break;
2186     case SIAtomicScope::WAVEFRONT:
2187     case SIAtomicScope::SINGLETHREAD:
2188       // No cache to bypass.
2189       break;
2190     default:
2191       llvm_unreachable("Unsupported synchronization scope");
2192     }
2193   }
2194
2195   /// The scratch address space does not need the global memory caches
2196   /// to be bypassed as all memory operations by the same thread are
2197   /// sequentially consistent, and no other thread can access scratch
2198   /// memory.
2199
2200   /// Other address spaces do not have a cache.
2201
2202   return Changed;
2203 }
2204
2205 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2206     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2207     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2208
2209   // Only handle load and store, not atomic read-modify-write instructions. The
2210   // latter use glc to indicate if the atomic returns a result and so must not
2211   // be used for cache control.
2212   assert(MI->mayLoad() ^ MI->mayStore());
2213
2214   // Only update load and store, not LLVM IR atomic read-modify-write
2215   // instructions. The latter are always marked as volatile, so they cannot
2216   // sensibly be handled here without pessimizing all atomics. They also do not
2217   // support the nontemporal attribute.
2218   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2219
2220   bool Changed = false;
2221
2222   if (IsVolatile) {
2223     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2224     // and MISS_LRU for store instructions.
2225     // Note: there is no L2 cache coherent bypass control at the ISA level.
2226     if (Op == SIMemOp::LOAD)
2227       Changed |= enableGLCBit(MI);
2228
2229     // Set MALL NOALLOC for load and store instructions.
2230     Changed |= enableDLCBit(MI);
2231
2232     // Ensure operation has completed at system scope to cause all volatile
2233     // operations to be visible outside the program in a global order. Do not
2234     // request cross address space as only the global address space can be
2235     // observable outside the program, so no need to cause a waitcnt for LDS
2236     // address space operations.
2237     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2238                           Position::AFTER);
2239     return Changed;
2240   }
2241
2242   if (IsNonTemporal) {
2243     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2244     // and L2 cache policy to STREAM.
2245     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2246     // to MISS_EVICT and the L2 cache policy to STREAM.
2247     if (Op == SIMemOp::STORE)
2248       Changed |= enableGLCBit(MI);
2249     Changed |= enableSLCBit(MI);
2250
2251     // Set MALL NOALLOC for load and store instructions.
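    // (On GFX11 the DLC bit is reinterpreted as the MALL NOALLOC hint.)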
2252 Changed |= enableDLCBit(MI); 2253 return Changed; 2254 } 2255 2256 return Changed; 2257 } 2258 2259 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, 2260 AMDGPU::CPol::CPol Value) const { 2261 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2262 if (!CPol) 2263 return false; 2264 2265 uint64_t NewTH = Value & AMDGPU::CPol::TH; 2266 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { 2267 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); 2268 return true; 2269 } 2270 2271 return false; 2272 } 2273 2274 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, 2275 AMDGPU::CPol::CPol Value) const { 2276 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2277 if (!CPol) 2278 return false; 2279 2280 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; 2281 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { 2282 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); 2283 return true; 2284 } 2285 2286 return false; 2287 } 2288 2289 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( 2290 const MachineBasicBlock::iterator MI) const { 2291 // TODO: implement flag for frontend to give us a hint not to insert waits. 2292 2293 MachineBasicBlock &MBB = *MI->getParent(); 2294 const DebugLoc &DL = MI->getDebugLoc(); 2295 2296 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); 2297 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); 2298 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); 2299 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); 2300 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); 2301 2302 return true; 2303 } 2304 2305 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, 2306 SIAtomicScope Scope, 2307 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2308 bool IsCrossAddrSpaceOrdering, 2309 Position Pos) const { 2310 bool Changed = false; 2311 2312 MachineBasicBlock &MBB = *MI->getParent(); 2313 DebugLoc DL = MI->getDebugLoc(); 2314 2315 bool LOADCnt = false; 2316 bool DSCnt = false; 2317 bool STORECnt = false; 2318 2319 if (Pos == Position::AFTER) 2320 ++MI; 2321 2322 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 2323 SIAtomicAddrSpace::NONE) { 2324 switch (Scope) { 2325 case SIAtomicScope::SYSTEM: 2326 case SIAtomicScope::AGENT: 2327 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2328 LOADCnt |= true; 2329 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2330 STORECnt |= true; 2331 break; 2332 case SIAtomicScope::WORKGROUP: 2333 // In WGP mode the waves of a work-group can be executing on either CU of 2334 // the WGP. Therefore need to wait for operations to complete to ensure 2335 // they are visible to waves in the other CU as the L0 is per CU. 2336 // Otherwise in CU mode and all waves of a work-group are on the same CU 2337 // which shares the same L0. 2338 if (!ST.isCuModeEnabled()) { 2339 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2340 LOADCnt |= true; 2341 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2342 STORECnt |= true; 2343 } 2344 break; 2345 case SIAtomicScope::WAVEFRONT: 2346 case SIAtomicScope::SINGLETHREAD: 2347 // The L0 cache keeps all memory operations in order for 2348 // work-items in the same wavefront. 
2349 break; 2350 default: 2351 llvm_unreachable("Unsupported synchronization scope"); 2352 } 2353 } 2354 2355 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 2356 switch (Scope) { 2357 case SIAtomicScope::SYSTEM: 2358 case SIAtomicScope::AGENT: 2359 case SIAtomicScope::WORKGROUP: 2360 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2361 // not needed as LDS operations for all waves are executed in a total 2362 // global ordering as observed by all waves. Required if also 2363 // synchronizing with global/GDS memory as LDS operations could be 2364 // reordered with respect to later global/GDS memory operations of the 2365 // same wave. 2366 DSCnt |= IsCrossAddrSpaceOrdering; 2367 break; 2368 case SIAtomicScope::WAVEFRONT: 2369 case SIAtomicScope::SINGLETHREAD: 2370 // The LDS keeps all memory operations in order for 2371 // the same wavefront. 2372 break; 2373 default: 2374 llvm_unreachable("Unsupported synchronization scope"); 2375 } 2376 } 2377 2378 if (LOADCnt) { 2379 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); 2380 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); 2381 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); 2382 Changed = true; 2383 } 2384 2385 if (STORECnt) { 2386 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); 2387 Changed = true; 2388 } 2389 2390 if (DSCnt) { 2391 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); 2392 Changed = true; 2393 } 2394 2395 if (Pos == Position::AFTER) 2396 --MI; 2397 2398 return Changed; 2399 } 2400 2401 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2402 SIAtomicScope Scope, 2403 SIAtomicAddrSpace AddrSpace, 2404 Position Pos) const { 2405 if (!InsertCacheInv) 2406 return false; 2407 2408 MachineBasicBlock &MBB = *MI->getParent(); 2409 DebugLoc DL = MI->getDebugLoc(); 2410 2411 /// The scratch address space does not need the global memory cache 2412 /// to be flushed as all memory operations by the same thread are 2413 /// sequentially consistent, and no other thread can access scratch 2414 /// memory. 2415 2416 /// Other address spaces do not have a cache. 2417 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2418 return false; 2419 2420 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2421 switch (Scope) { 2422 case SIAtomicScope::SYSTEM: 2423 ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2424 break; 2425 case SIAtomicScope::AGENT: 2426 ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2427 break; 2428 case SIAtomicScope::WORKGROUP: 2429 // In WGP mode the waves of a work-group can be executing on either CU of 2430 // the WGP. Therefore we need to invalidate the L0 which is per CU. 2431 // Otherwise in CU mode all waves of a work-group are on the same CU, and so 2432 // the L0 does not need to be invalidated. 2433 if (ST.isCuModeEnabled()) 2434 return false; 2435 2436 ScopeImm = AMDGPU::CPol::SCOPE_SE; 2437 break; 2438 case SIAtomicScope::WAVEFRONT: 2439 case SIAtomicScope::SINGLETHREAD: 2440 // No cache to invalidate. 
2441 return false; 2442 default: 2443 llvm_unreachable("Unsupported synchronization scope"); 2444 } 2445 2446 if (Pos == Position::AFTER) 2447 ++MI; 2448 2449 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); 2450 2451 if (Pos == Position::AFTER) 2452 --MI; 2453 2454 return true; 2455 } 2456 2457 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 2458 SIAtomicScope Scope, 2459 SIAtomicAddrSpace AddrSpace, 2460 bool IsCrossAddrSpaceOrdering, 2461 Position Pos) const { 2462 MachineBasicBlock &MBB = *MI->getParent(); 2463 DebugLoc DL = MI->getDebugLoc(); 2464 2465 // The scratch address space does not need the global memory cache 2466 // writeback as all memory operations by the same thread are 2467 // sequentially consistent, and no other thread can access scratch 2468 // memory. 2469 2470 // Other address spaces do not have a cache. 2471 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2472 return false; 2473 2474 if (Pos == Position::AFTER) 2475 ++MI; 2476 2477 // GLOBAL_WB is always needed, even for write-through caches, as it 2478 // additionally ensures all operations have reached the desired cache level. 2479 bool SkipWB = false; 2480 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2481 switch (Scope) { 2482 case SIAtomicScope::SYSTEM: 2483 ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2484 break; 2485 case SIAtomicScope::AGENT: 2486 ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2487 break; 2488 case SIAtomicScope::WORKGROUP: 2489 // In WGP mode the waves of a work-group can be executing on either CU of 2490 // the WGP. Therefore we need to ensure all operations have reached L1, 2491 // hence the SCOPE_SE WB. 2492 // For CU mode, we need operations to reach L0, so the wait is enough - 2493 // there are no ways for an operation to report completion without reaching 2494 // at least L0. 2495 if (ST.isCuModeEnabled()) 2496 SkipWB = true; 2497 else 2498 ScopeImm = AMDGPU::CPol::SCOPE_SE; 2499 break; 2500 case SIAtomicScope::WAVEFRONT: 2501 case SIAtomicScope::SINGLETHREAD: 2502 // No cache to invalidate. 2503 return false; 2504 default: 2505 llvm_unreachable("Unsupported synchronization scope"); 2506 } 2507 2508 if (!SkipWB) 2509 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm); 2510 2511 if (Pos == Position::AFTER) 2512 --MI; 2513 2514 // We always have to wait for previous memory operations (load/store) to 2515 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), 2516 // we of course need to wait for that as well. 2517 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 2518 IsCrossAddrSpaceOrdering, Pos); 2519 2520 return true; 2521 } 2522 2523 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( 2524 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2525 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 2526 2527 // Only handle load and store, not atomic read-modify-write instructions. 2528 assert(MI->mayLoad() ^ MI->mayStore()); 2529 2530 // Only update load and store, not LLVM IR atomic read-modify-write 2531 // instructions. The latter are always marked as volatile so cannot sensibly 2532 // handle it as do not want to pessimize all atomics. Also they do not support 2533 // the nontemporal attribute. 2534 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 2535 2536 bool Changed = false; 2537 2538 if (IsLastUse) { 2539 // Set last-use hint. 
2540     Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2541   } else if (IsNonTemporal) {
2542     // Set non-temporal hint for all cache levels.
2543     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2544   }
2545
2546   if (IsVolatile) {
2547     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2548
2549     if (Op == SIMemOp::STORE)
2550       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2551
2552     // Ensure operation has completed at system scope to cause all volatile
2553     // operations to be visible outside the program in a global order. Do not
2554     // request cross address space as only the global address space can be
2555     // observable outside the program, so no need to cause a waitcnt for LDS
2556     // address space operations.
2557     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2558                           Position::AFTER);
2559   }
2560
2561   return Changed;
2562 }
2563
2564 bool SIGfx12CacheControl::expandSystemScopeStore(
2565     MachineBasicBlock::iterator &MI) const {
2566   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2567   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2568     return insertWaitsBeforeSystemScopeStore(MI);
2569
2570   return false;
2571 }
2572
2573 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2574                                          SIAtomicScope Scope,
2575                                          SIAtomicAddrSpace AddrSpace) const {
2576   bool Changed = false;
2577
2578   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2579     switch (Scope) {
2580     case SIAtomicScope::SYSTEM:
2581       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2582       break;
2583     case SIAtomicScope::AGENT:
2584       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2585       break;
2586     case SIAtomicScope::WORKGROUP:
2587       // In WGP mode, SCOPE_SE is needed as waves can execute on different CUs
2588       // that access different L0s.
2589       if (!ST.isCuModeEnabled())
2590         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2591       break;
2592     case SIAtomicScope::WAVEFRONT:
2593     case SIAtomicScope::SINGLETHREAD:
2594       // No cache to bypass.
2595       break;
2596     default:
2597       llvm_unreachable("Unsupported synchronization scope");
2598     }
2599   }
2600
2601   // The scratch address space does not need the global memory caches
2602   // to be bypassed as all memory operations by the same thread are
2603   // sequentially consistent, and no other thread can access scratch
2604   // memory.
2605
2606   // Other address spaces do not have a cache.
2607 2608 return Changed; 2609 } 2610 2611 bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 2612 if (AtomicPseudoMIs.empty()) 2613 return false; 2614 2615 for (auto &MI : AtomicPseudoMIs) 2616 MI->eraseFromParent(); 2617 2618 AtomicPseudoMIs.clear(); 2619 return true; 2620 } 2621 2622 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 2623 MachineBasicBlock::iterator &MI) { 2624 assert(MI->mayLoad() && !MI->mayStore()); 2625 2626 bool Changed = false; 2627 2628 if (MOI.isAtomic()) { 2629 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2630 MOI.getOrdering() == AtomicOrdering::Acquire || 2631 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2632 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 2633 MOI.getOrderingAddrSpace()); 2634 } 2635 2636 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2637 Changed |= CC->insertWait(MI, MOI.getScope(), 2638 MOI.getOrderingAddrSpace(), 2639 SIMemOp::LOAD | SIMemOp::STORE, 2640 MOI.getIsCrossAddressSpaceOrdering(), 2641 Position::BEFORE); 2642 2643 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2644 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2645 Changed |= CC->insertWait(MI, MOI.getScope(), 2646 MOI.getInstrAddrSpace(), 2647 SIMemOp::LOAD, 2648 MOI.getIsCrossAddressSpaceOrdering(), 2649 Position::AFTER); 2650 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2651 MOI.getOrderingAddrSpace(), 2652 Position::AFTER); 2653 } 2654 2655 return Changed; 2656 } 2657 2658 // Atomic instructions already bypass caches to the scope specified by the 2659 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use 2660 // instructions need additional treatment. 2661 Changed |= CC->enableVolatileAndOrNonTemporal( 2662 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), 2663 MOI.isNonTemporal(), MOI.isLastUse()); 2664 2665 return Changed; 2666 } 2667 2668 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, 2669 MachineBasicBlock::iterator &MI) { 2670 assert(!MI->mayLoad() && MI->mayStore()); 2671 2672 bool Changed = false; 2673 2674 if (MOI.isAtomic()) { 2675 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2676 MOI.getOrdering() == AtomicOrdering::Release || 2677 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2678 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), 2679 MOI.getOrderingAddrSpace()); 2680 } 2681 2682 if (MOI.getOrdering() == AtomicOrdering::Release || 2683 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2684 Changed |= CC->insertRelease(MI, MOI.getScope(), 2685 MOI.getOrderingAddrSpace(), 2686 MOI.getIsCrossAddressSpaceOrdering(), 2687 Position::BEFORE); 2688 2689 return Changed; 2690 } 2691 2692 // Atomic instructions already bypass caches to the scope specified by the 2693 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2694 // need additional treatment. 2695 Changed |= CC->enableVolatileAndOrNonTemporal( 2696 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), 2697 MOI.isNonTemporal()); 2698 2699 // GFX12 specific, scope(desired coherence domain in cache hierarchy) is 2700 // instruction field, do not confuse it with atomic scope. 
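  // (For GFX12 this may prepend the soft waits emitted by
  // insertWaitsBeforeSystemScopeStore when the store's cpol scope is SCOPE_SYS;
  // for other targets it is expected to be a no-op.)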
2701   Changed |= CC->expandSystemScopeStore(MI);
2702   return Changed;
2703 }
2704
2705 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2706                                           MachineBasicBlock::iterator &MI) {
2707   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2708
2709   AtomicPseudoMIs.push_back(MI);
2710   bool Changed = false;
2711
2712   // Refine fenced address space based on MMRAs.
2713   //
2714   // TODO: Should we support this MMRA on other atomic operations?
2715   auto OrderingAddrSpace =
2716       getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2717
2718   if (MOI.isAtomic()) {
2719     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2720       Changed |= CC->insertWait(
2721           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2722           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);
2723
2724     if (MOI.getOrdering() == AtomicOrdering::Release ||
2725         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2726         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2727       /// TODO: This relies on a barrier always generating a waitcnt
2728       /// for LDS to ensure it is not reordered with the completion of
2729       /// the preceding LDS operations. If the barrier had a memory
2730       /// ordering and memory scope, then the library would not need to
2731       /// generate a fence. Could add support in this file for
2732       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2733       /// adding S_WAITCNT before a S_BARRIER.
2734       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2735                                    MOI.getIsCrossAddressSpaceOrdering(),
2736                                    Position::BEFORE);
2737
2738     // TODO: If both release and invalidate are happening they could be combined
2739     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2740     // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass
2741     // to track cache invalidate and write back instructions.
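    // For example, on GFX12 a system-scope seq_cst fence is expected to expand
    // roughly to a GLOBAL_WB at SCOPE_SYS plus the required S_WAIT_*CNT 0 waits
    // for the release, followed by a GLOBAL_INV at SCOPE_SYS for the acquire.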
2742 2743 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2744 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2745 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2746 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, 2747 Position::BEFORE); 2748 2749 return Changed; 2750 } 2751 2752 return Changed; 2753 } 2754 2755 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 2756 MachineBasicBlock::iterator &MI) { 2757 assert(MI->mayLoad() && MI->mayStore()); 2758 2759 bool Changed = false; 2760 2761 if (MOI.isAtomic()) { 2762 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2763 MOI.getOrdering() == AtomicOrdering::Acquire || 2764 MOI.getOrdering() == AtomicOrdering::Release || 2765 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2766 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2767 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2768 MOI.getInstrAddrSpace()); 2769 } 2770 2771 if (MOI.getOrdering() == AtomicOrdering::Release || 2772 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2773 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2774 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2775 Changed |= CC->insertRelease(MI, MOI.getScope(), 2776 MOI.getOrderingAddrSpace(), 2777 MOI.getIsCrossAddressSpaceOrdering(), 2778 Position::BEFORE); 2779 2780 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2781 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2782 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2783 MOI.getFailureOrdering() == AtomicOrdering::Acquire || 2784 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 2785 Changed |= CC->insertWait(MI, MOI.getScope(), 2786 MOI.getInstrAddrSpace(), 2787 isAtomicRet(*MI) ? SIMemOp::LOAD : 2788 SIMemOp::STORE, 2789 MOI.getIsCrossAddressSpaceOrdering(), 2790 Position::AFTER); 2791 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2792 MOI.getOrderingAddrSpace(), 2793 Position::AFTER); 2794 } 2795 2796 return Changed; 2797 } 2798 2799 return Changed; 2800 } 2801 2802 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 2803 bool Changed = false; 2804 2805 SIMemOpAccess MOA(MF); 2806 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 2807 2808 for (auto &MBB : MF) { 2809 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 2810 2811 // Unbundle instructions after the post-RA scheduler. 
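      // Bundles of memory instructions (e.g. load/store clauses formed by the
      // post-RA scheduler) are split apart here so that each memory instruction
      // can be legalized individually below.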
2812 if (MI->isBundle() && MI->mayLoadOrStore()) { 2813 MachineBasicBlock::instr_iterator II(MI->getIterator()); 2814 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 2815 I != E && I->isBundledWithPred(); ++I) { 2816 I->unbundleFromPred(); 2817 for (MachineOperand &MO : I->operands()) 2818 if (MO.isReg()) 2819 MO.setIsInternalRead(false); 2820 } 2821 2822 MI->eraseFromParent(); 2823 MI = II->getIterator(); 2824 } 2825 2826 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 2827 continue; 2828 2829 if (const auto &MOI = MOA.getLoadInfo(MI)) 2830 Changed |= expandLoad(*MOI, MI); 2831 else if (const auto &MOI = MOA.getStoreInfo(MI)) { 2832 Changed |= expandStore(*MOI, MI); 2833 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); 2834 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2835 Changed |= expandAtomicFence(*MOI, MI); 2836 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2837 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); 2838 } 2839 } 2840 2841 Changed |= removeAtomicPseudoMIs(); 2842 return Changed; 2843 } 2844 2845 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 2846 2847 char SIMemoryLegalizer::ID = 0; 2848 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 2849 2850 FunctionPass *llvm::createSIMemoryLegalizerPass() { 2851 return new SIMemoryLegalizer(); 2852 } 2853