//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  const AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or
  /// nontemporal/last-use. Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal,
                                              bool IsLastUse = false) const = 0;

  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  };

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering, Position Pos,
                          AtomicOrdering Order) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

  virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) const {
    return false;
  }
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                           MachineBasicBlock::iterator &MI) const override {
    bool Changed = false;
    if (ST.hasForceStoreSC0SC1() &&
        (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
                                    SIAtomicAddrSpace::GLOBAL |
                                    SIAtomicAddrSpace::OTHER)) !=
            SIAtomicAddrSpace::NONE) {
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
    }
    return Changed;
  }
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning-atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};

void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
  const MachineFunction *MF = MI.getMF();
  const Function &Fn = MF->getFunction();
  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "unknown address space '" << AS << "'; expected one of ";
  ListSeparator LS;
  for (const auto &[Name, Val] : ASNames)
    OS << LS << '\'' << Name << '\'';
  DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
  Fn.getContext().diagnose(BadTag);
}

/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
/// If this tag isn't present, or if it has no meaningful values, returns \p
/// Default. Otherwise returns all the address spaces concerned by the MMRA.
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
                                               SIAtomicAddrSpace Default) {
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";

  auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
  if (!MMRA)
    return Default;

  SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
  for (const auto &[Prefix, Suffix] : MMRA) {
    if (Prefix != FenceASPrefix)
      continue;

    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
  }

  return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
}

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
    : MMI(&MMI_) {}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the isa level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here without pessimizing all atomics. They also do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering, Position Pos,
                                    AtomicOrdering Order) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here without pessimizing all atomics. They also do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos,
                                      AtomicOrdering Order) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos, Order);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
1552 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1553 // Set SC bits to indicate system scope. 1554 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1555 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1556 // vmcnt(0)" needed by the "BUFFER_WBL2". 1557 Changed = true; 1558 break; 1559 case SIAtomicScope::AGENT: 1560 case SIAtomicScope::WORKGROUP: 1561 case SIAtomicScope::WAVEFRONT: 1562 case SIAtomicScope::SINGLETHREAD: 1563 // Same as GFX7. 1564 break; 1565 default: 1566 llvm_unreachable("Unsupported synchronization scope"); 1567 } 1568 } 1569 1570 if (Pos == Position::AFTER) 1571 --MI; 1572 1573 Changed |= 1574 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1575 IsCrossAddrSpaceOrdering, Pos); 1576 1577 return Changed; 1578 } 1579 1580 bool SIGfx940CacheControl::enableLoadCacheBypass( 1581 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1582 SIAtomicAddrSpace AddrSpace) const { 1583 assert(MI->mayLoad() && !MI->mayStore()); 1584 bool Changed = false; 1585 1586 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1587 switch (Scope) { 1588 case SIAtomicScope::SYSTEM: 1589 // Set SC bits to indicate system scope. 1590 Changed |= enableSC0Bit(MI); 1591 Changed |= enableSC1Bit(MI); 1592 break; 1593 case SIAtomicScope::AGENT: 1594 // Set SC bits to indicate agent scope. 1595 Changed |= enableSC1Bit(MI); 1596 break; 1597 case SIAtomicScope::WORKGROUP: 1598 // In threadgroup split mode the waves of a work-group can be executing on 1599 // different CUs. Therefore need to bypass the L1 which is per CU. 1600 // Otherwise in non-threadgroup split mode all waves of a work-group are 1601 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1602 // bits to indicate work-group scope will do this automatically. 1603 Changed |= enableSC0Bit(MI); 1604 break; 1605 case SIAtomicScope::WAVEFRONT: 1606 case SIAtomicScope::SINGLETHREAD: 1607 // Leave SC bits unset to indicate wavefront scope. 1608 break; 1609 default: 1610 llvm_unreachable("Unsupported synchronization scope"); 1611 } 1612 } 1613 1614 /// The scratch address space does not need the global memory caches 1615 /// to be bypassed as all memory operations by the same thread are 1616 /// sequentially consistent, and no other thread can access scratch 1617 /// memory. 1618 1619 /// Other address spaces do not have a cache. 1620 1621 return Changed; 1622 } 1623 1624 bool SIGfx940CacheControl::enableStoreCacheBypass( 1625 const MachineBasicBlock::iterator &MI, 1626 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1627 assert(!MI->mayLoad() && MI->mayStore()); 1628 bool Changed = false; 1629 1630 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1631 switch (Scope) { 1632 case SIAtomicScope::SYSTEM: 1633 // Set SC bits to indicate system scope. 1634 Changed |= enableSC0Bit(MI); 1635 Changed |= enableSC1Bit(MI); 1636 break; 1637 case SIAtomicScope::AGENT: 1638 // Set SC bits to indicate agent scope. 1639 Changed |= enableSC1Bit(MI); 1640 break; 1641 case SIAtomicScope::WORKGROUP: 1642 // Set SC bits to indicate workgroup scope. 1643 Changed |= enableSC0Bit(MI); 1644 break; 1645 case SIAtomicScope::WAVEFRONT: 1646 case SIAtomicScope::SINGLETHREAD: 1647 // Leave SC bits unset to indicate wavefront scope. 
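      // Summary of the SC encoding implied by this switch (illustrative):
      //   SC0|SC1 = system, SC1 = agent, SC0 = work-group, neither = wavefront.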
1648       break;
1649     default:
1650       llvm_unreachable("Unsupported synchronization scope");
1651     }
1652   }
1653
1654   /// The scratch address space does not need the global memory caches
1655   /// to be bypassed as all memory operations by the same thread are
1656   /// sequentially consistent, and no other thread can access scratch
1657   /// memory.
1658
1659   /// Other address spaces do not have a cache.
1660
1661   return Changed;
1662 }
1663
1664 bool SIGfx940CacheControl::enableRMWCacheBypass(
1665     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1666     SIAtomicAddrSpace AddrSpace) const {
1667   assert(MI->mayLoad() && MI->mayStore());
1668   bool Changed = false;
1669
1670   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1671     switch (Scope) {
1672     case SIAtomicScope::SYSTEM:
1673       // Set SC1 bit to indicate system scope.
1674       Changed |= enableSC1Bit(MI);
1675       break;
1676     case SIAtomicScope::AGENT:
1677     case SIAtomicScope::WORKGROUP:
1678     case SIAtomicScope::WAVEFRONT:
1679     case SIAtomicScope::SINGLETHREAD:
1680       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1681       // to indicate system or agent scope. The SC0 bit is used to indicate if
1682       // they are return or no-return. Leave SC1 bit unset to indicate agent
1683       // scope.
1684       break;
1685     default:
1686       llvm_unreachable("Unsupported synchronization scope");
1687     }
1688   }
1689
1690   return Changed;
1691 }
1692
1693 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1694     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1695     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1696   // Only handle load and store, not atomic read-modify-write instructions. The
1697   // latter use glc to indicate if the atomic returns a result and so must not
1698   // be used for cache control.
1699   assert(MI->mayLoad() ^ MI->mayStore());
1700
1701   // Only update load and store, not LLVM IR atomic read-modify-write
1702   // instructions. The latter are always marked as volatile, so they cannot
1703   // sensibly be handled here, as we do not want to pessimize all atomics. They
1704   // also do not support the nontemporal attribute.
1705   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1706
1707   bool Changed = false;
1708
1709   if (IsVolatile) {
1710     // Set SC bits to indicate system scope.
1711     Changed |= enableSC0Bit(MI);
1712     Changed |= enableSC1Bit(MI);
1713
1714     // Ensure operation has completed at system scope to cause all volatile
1715     // operations to be visible outside the program in a global order. Do not
1716     // request cross address space as only the global address space can be
1717     // observable outside the program, so no need to cause a waitcnt for LDS
1718     // address space operations.
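    // As a sketch (not verbatim output), a volatile global load on gfx940 is
    // expected to end up roughly as:
    //   GLOBAL_LOAD_DWORD ... sc0 sc1
    //   S_WAITCNT vmcnt(0)
    // i.e. the SC bits set above plus the wait inserted below.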
1719     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1720                           Position::AFTER, AtomicOrdering::Unordered);
1721
1722     return Changed;
1723   }
1724
1725   if (IsNonTemporal) {
1726     Changed |= enableNTBit(MI);
1727     return Changed;
1728   }
1729
1730   return Changed;
1731 }
1732
1733 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1734                                          SIAtomicScope Scope,
1735                                          SIAtomicAddrSpace AddrSpace,
1736                                          Position Pos) const {
1737   if (!InsertCacheInv)
1738     return false;
1739
1740   bool Changed = false;
1741
1742   MachineBasicBlock &MBB = *MI->getParent();
1743   DebugLoc DL = MI->getDebugLoc();
1744
1745   if (Pos == Position::AFTER)
1746     ++MI;
1747
1748   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1749     switch (Scope) {
1750     case SIAtomicScope::SYSTEM:
1751       // Ensures that following loads will not see stale remote VMEM data or
1752       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1753       // CC will never be stale due to the local memory probes.
1754       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1755           // Set SC bits to indicate system scope.
1756           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1757       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1758       // hardware does not reorder memory operations by the same wave with
1759       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1760       // remove any cache lines of earlier writes by the same wave and ensures
1761       // later reads by the same wave will refetch the cache lines.
1762       Changed = true;
1763       break;
1764     case SIAtomicScope::AGENT:
1765       // Ensures that following loads will not see stale remote data or local
1766       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1767       // due to the memory probes.
1768       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1769           // Set SC bits to indicate agent scope.
1770           .addImm(AMDGPU::CPol::SC1);
1771       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1772       // does not reorder memory operations with respect to a preceding buffer
1773       // invalidate. The invalidate is guaranteed to remove any cache lines of
1774       // earlier writes and ensures later reads will refetch the cache lines.
1775       Changed = true;
1776       break;
1777     case SIAtomicScope::WORKGROUP:
1778       // In threadgroup split mode the waves of a work-group can be executing on
1779       // different CUs. Therefore need to invalidate the L1 which is per CU.
1780       // Otherwise in non-threadgroup split mode all waves of a work-group are
1781       // on the same CU, and so the L1 does not need to be invalidated.
1782       if (ST.isTgSplitEnabled()) {
1783         // Ensures L1 is invalidated if in threadgroup split mode. In
1784         // non-threadgroup split mode it is a NOP, but there is no point
1785         // generating it in that case if we know we are not in that mode.
1786         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1787             // Set SC bits to indicate work-group scope.
1788             .addImm(AMDGPU::CPol::SC0);
1789         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1790         // does not reorder memory operations with respect to a preceding buffer
1791         // invalidate. The invalidate is guaranteed to remove any cache lines of
1792         // earlier writes and ensures later reads will refetch the cache lines.
1793         Changed = true;
1794       }
1795       break;
1796     case SIAtomicScope::WAVEFRONT:
1797     case SIAtomicScope::SINGLETHREAD:
1798       // Could generate "BUFFER_INV" but it would do nothing as there are no
1799       // caches to invalidate.
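      // In summary (a sketch of the cases above): a system acquire emits
      // "BUFFER_INV sc0 sc1", an agent acquire emits "BUFFER_INV sc1", and a
      // work-group acquire emits "BUFFER_INV sc0" only in threadgroup split
      // mode.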
1800 break; 1801 default: 1802 llvm_unreachable("Unsupported synchronization scope"); 1803 } 1804 } 1805 1806 /// The scratch address space does not need the global memory cache 1807 /// to be flushed as all memory operations by the same thread are 1808 /// sequentially consistent, and no other thread can access scratch 1809 /// memory. 1810 1811 /// Other address spaces do not have a cache. 1812 1813 if (Pos == Position::AFTER) 1814 --MI; 1815 1816 return Changed; 1817 } 1818 1819 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1820 SIAtomicScope Scope, 1821 SIAtomicAddrSpace AddrSpace, 1822 bool IsCrossAddrSpaceOrdering, 1823 Position Pos) const { 1824 bool Changed = false; 1825 1826 MachineBasicBlock &MBB = *MI->getParent(); 1827 DebugLoc DL = MI->getDebugLoc(); 1828 1829 if (Pos == Position::AFTER) 1830 ++MI; 1831 1832 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1833 switch (Scope) { 1834 case SIAtomicScope::SYSTEM: 1835 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1836 // hardware does not reorder memory operations by the same wave with 1837 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1838 // to initiate writeback of any dirty cache lines of earlier writes by the 1839 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1840 // writeback has completed. 1841 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1842 // Set SC bits to indicate system scope. 1843 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1844 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1845 // SIAtomicScope::SYSTEM, the following insertWait will generate the 1846 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 1847 Changed = true; 1848 break; 1849 case SIAtomicScope::AGENT: 1850 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1851 // Set SC bits to indicate agent scope. 1852 .addImm(AMDGPU::CPol::SC1); 1853 1854 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1855 // SIAtomicScope::AGENT, the following insertWait will generate the 1856 // required "S_WAITCNT vmcnt(0)". 1857 Changed = true; 1858 break; 1859 case SIAtomicScope::WORKGROUP: 1860 case SIAtomicScope::WAVEFRONT: 1861 case SIAtomicScope::SINGLETHREAD: 1862 // Do not generate "BUFFER_WBL2" as there are no caches it would 1863 // writeback, and would require an otherwise unnecessary 1864 // "S_WAITCNT vmcnt(0)". 1865 break; 1866 default: 1867 llvm_unreachable("Unsupported synchronization scope"); 1868 } 1869 } 1870 1871 if (Pos == Position::AFTER) 1872 --MI; 1873 1874 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other 1875 // S_WAITCNT needed. 1876 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1877 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); 1878 1879 return Changed; 1880 } 1881 1882 bool SIGfx10CacheControl::enableLoadCacheBypass( 1883 const MachineBasicBlock::iterator &MI, 1884 SIAtomicScope Scope, 1885 SIAtomicAddrSpace AddrSpace) const { 1886 assert(MI->mayLoad() && !MI->mayStore()); 1887 bool Changed = false; 1888 1889 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1890 switch (Scope) { 1891 case SIAtomicScope::SYSTEM: 1892 case SIAtomicScope::AGENT: 1893 // Set the L0 and L1 cache policies to MISS_EVICT. 1894 // Note: there is no L2 cache coherent bypass control at the ISA level. 
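      // For example (illustrative), an agent-scope atomic load on gfx10 is
      // marked with both glc and dlc by the code below, so it bypasses L0 and
      // L1 but still uses L2.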
1895       Changed |= enableGLCBit(MI);
1896       Changed |= enableDLCBit(MI);
1897       break;
1898     case SIAtomicScope::WORKGROUP:
1899       // In WGP mode the waves of a work-group can be executing on either CU of
1900       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1901       // CU mode all waves of a work-group are on the same CU, and so the L0
1902       // does not need to be bypassed.
1903       if (!ST.isCuModeEnabled())
1904         Changed |= enableGLCBit(MI);
1905       break;
1906     case SIAtomicScope::WAVEFRONT:
1907     case SIAtomicScope::SINGLETHREAD:
1908       // No cache to bypass.
1909       break;
1910     default:
1911       llvm_unreachable("Unsupported synchronization scope");
1912     }
1913   }
1914
1915   /// The scratch address space does not need the global memory caches
1916   /// to be bypassed as all memory operations by the same thread are
1917   /// sequentially consistent, and no other thread can access scratch
1918   /// memory.
1919
1920   /// Other address spaces do not have a cache.
1921
1922   return Changed;
1923 }
1924
1925 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1926     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1927     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1928
1929   // Only handle load and store, not atomic read-modify-write instructions. The
1930   // latter use glc to indicate if the atomic returns a result and so must not
1931   // be used for cache control.
1932   assert(MI->mayLoad() ^ MI->mayStore());
1933
1934   // Only update load and store, not LLVM IR atomic read-modify-write
1935   // instructions. The latter are always marked as volatile, so they cannot
1936   // sensibly be handled here, as we do not want to pessimize all atomics. They
1937   // also do not support the nontemporal attribute.
1938   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1939
1940   bool Changed = false;
1941
1942   if (IsVolatile) {
1943     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1944     // and MISS_LRU for store instructions.
1945     // Note: there is no L2 cache coherent bypass control at the ISA level.
1946     if (Op == SIMemOp::LOAD) {
1947       Changed |= enableGLCBit(MI);
1948       Changed |= enableDLCBit(MI);
1949     }
1950
1951     // Ensure operation has completed at system scope to cause all volatile
1952     // operations to be visible outside the program in a global order. Do not
1953     // request cross address space as only the global address space can be
1954     // observable outside the program, so no need to cause a waitcnt for LDS
1955     // address space operations.
1956     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1957                           Position::AFTER, AtomicOrdering::Unordered);
1958     return Changed;
1959   }
1960
1961   if (IsNonTemporal) {
1962     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1963     // and L2 cache policy to STREAM.
1964     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1965     // to MISS_EVICT and the L2 cache policy to STREAM.
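    // Rough cache-policy bits for nontemporal accesses on gfx10 (a sketch of
    // the code below):
    //   loads:  slc
    //   stores: glc slc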
1966 if (Op == SIMemOp::STORE) 1967 Changed |= enableGLCBit(MI); 1968 Changed |= enableSLCBit(MI); 1969 1970 return Changed; 1971 } 1972 1973 return Changed; 1974 } 1975 1976 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1977 SIAtomicScope Scope, 1978 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1979 bool IsCrossAddrSpaceOrdering, 1980 Position Pos, AtomicOrdering Order) const { 1981 bool Changed = false; 1982 1983 MachineBasicBlock &MBB = *MI->getParent(); 1984 DebugLoc DL = MI->getDebugLoc(); 1985 1986 if (Pos == Position::AFTER) 1987 ++MI; 1988 1989 bool VMCnt = false; 1990 bool VSCnt = false; 1991 bool LGKMCnt = false; 1992 1993 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1994 SIAtomicAddrSpace::NONE) { 1995 switch (Scope) { 1996 case SIAtomicScope::SYSTEM: 1997 case SIAtomicScope::AGENT: 1998 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1999 VMCnt |= true; 2000 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2001 VSCnt |= true; 2002 break; 2003 case SIAtomicScope::WORKGROUP: 2004 // In WGP mode the waves of a work-group can be executing on either CU of 2005 // the WGP. Therefore need to wait for operations to complete to ensure 2006 // they are visible to waves in the other CU as the L0 is per CU. 2007 // Otherwise in CU mode and all waves of a work-group are on the same CU 2008 // which shares the same L0. 2009 if (!ST.isCuModeEnabled()) { 2010 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2011 VMCnt |= true; 2012 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2013 VSCnt |= true; 2014 } 2015 break; 2016 case SIAtomicScope::WAVEFRONT: 2017 case SIAtomicScope::SINGLETHREAD: 2018 // The L0 cache keeps all memory operations in order for 2019 // work-items in the same wavefront. 2020 break; 2021 default: 2022 llvm_unreachable("Unsupported synchronization scope"); 2023 } 2024 } 2025 2026 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 2027 switch (Scope) { 2028 case SIAtomicScope::SYSTEM: 2029 case SIAtomicScope::AGENT: 2030 case SIAtomicScope::WORKGROUP: 2031 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2032 // not needed as LDS operations for all waves are executed in a total 2033 // global ordering as observed by all waves. Required if also 2034 // synchronizing with global/GDS memory as LDS operations could be 2035 // reordered with respect to later global/GDS memory operations of the 2036 // same wave. 2037 LGKMCnt |= IsCrossAddrSpaceOrdering; 2038 break; 2039 case SIAtomicScope::WAVEFRONT: 2040 case SIAtomicScope::SINGLETHREAD: 2041 // The LDS keeps all memory operations in order for 2042 // the same wavefront. 2043 break; 2044 default: 2045 llvm_unreachable("Unsupported synchronization scope"); 2046 } 2047 } 2048 2049 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 2050 switch (Scope) { 2051 case SIAtomicScope::SYSTEM: 2052 case SIAtomicScope::AGENT: 2053 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 2054 // is not needed as GDS operations for all waves are executed in a total 2055 // global ordering as observed by all waves. Required if also 2056 // synchronizing with global/LDS memory as GDS operations could be 2057 // reordered with respect to later global/LDS memory operations of the 2058 // same wave. 
2059 LGKMCnt |= IsCrossAddrSpaceOrdering; 2060 break; 2061 case SIAtomicScope::WORKGROUP: 2062 case SIAtomicScope::WAVEFRONT: 2063 case SIAtomicScope::SINGLETHREAD: 2064 // The GDS keeps all memory operations in order for 2065 // the same work-group. 2066 break; 2067 default: 2068 llvm_unreachable("Unsupported synchronization scope"); 2069 } 2070 } 2071 2072 if (VMCnt || LGKMCnt) { 2073 unsigned WaitCntImmediate = 2074 AMDGPU::encodeWaitcnt(IV, 2075 VMCnt ? 0 : getVmcntBitMask(IV), 2076 getExpcntBitMask(IV), 2077 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 2078 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 2079 .addImm(WaitCntImmediate); 2080 Changed = true; 2081 } 2082 2083 if (VSCnt) { 2084 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) 2085 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 2086 .addImm(0); 2087 Changed = true; 2088 } 2089 2090 if (Pos == Position::AFTER) 2091 --MI; 2092 2093 return Changed; 2094 } 2095 2096 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2097 SIAtomicScope Scope, 2098 SIAtomicAddrSpace AddrSpace, 2099 Position Pos) const { 2100 if (!InsertCacheInv) 2101 return false; 2102 2103 bool Changed = false; 2104 2105 MachineBasicBlock &MBB = *MI->getParent(); 2106 DebugLoc DL = MI->getDebugLoc(); 2107 2108 if (Pos == Position::AFTER) 2109 ++MI; 2110 2111 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2112 switch (Scope) { 2113 case SIAtomicScope::SYSTEM: 2114 case SIAtomicScope::AGENT: 2115 // The order of invalidates matter here. We must invalidate "outer in" 2116 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is 2117 // invalidated. 2118 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 2119 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2120 Changed = true; 2121 break; 2122 case SIAtomicScope::WORKGROUP: 2123 // In WGP mode the waves of a work-group can be executing on either CU of 2124 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 2125 // in CU mode and all waves of a work-group are on the same CU, and so the 2126 // L0 does not need to be invalidated. 2127 if (!ST.isCuModeEnabled()) { 2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2129 Changed = true; 2130 } 2131 break; 2132 case SIAtomicScope::WAVEFRONT: 2133 case SIAtomicScope::SINGLETHREAD: 2134 // No cache to invalidate. 2135 break; 2136 default: 2137 llvm_unreachable("Unsupported synchronization scope"); 2138 } 2139 } 2140 2141 /// The scratch address space does not need the global memory cache 2142 /// to be flushed as all memory operations by the same thread are 2143 /// sequentially consistent, and no other thread can access scratch 2144 /// memory. 2145 2146 /// Other address spaces do not have a cache. 2147 2148 if (Pos == Position::AFTER) 2149 --MI; 2150 2151 return Changed; 2152 } 2153 2154 bool SIGfx11CacheControl::enableLoadCacheBypass( 2155 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 2156 SIAtomicAddrSpace AddrSpace) const { 2157 assert(MI->mayLoad() && !MI->mayStore()); 2158 bool Changed = false; 2159 2160 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2161 switch (Scope) { 2162 case SIAtomicScope::SYSTEM: 2163 case SIAtomicScope::AGENT: 2164 // Set the L0 and L1 cache policies to MISS_EVICT. 2165 // Note: there is no L2 cache coherent bypass control at the ISA level. 
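      // Note (illustrative): unlike gfx10, only glc is set here; on gfx11 the
      // dlc bit is used for MALL NOALLOC and is handled separately in
      // enableVolatileAndOrNonTemporal below.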
2166       Changed |= enableGLCBit(MI);
2167       break;
2168     case SIAtomicScope::WORKGROUP:
2169       // In WGP mode the waves of a work-group can be executing on either CU of
2170       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2171       // CU mode all waves of a work-group are on the same CU, and so the L0
2172       // does not need to be bypassed.
2173       if (!ST.isCuModeEnabled())
2174         Changed |= enableGLCBit(MI);
2175       break;
2176     case SIAtomicScope::WAVEFRONT:
2177     case SIAtomicScope::SINGLETHREAD:
2178       // No cache to bypass.
2179       break;
2180     default:
2181       llvm_unreachable("Unsupported synchronization scope");
2182     }
2183   }
2184
2185   /// The scratch address space does not need the global memory caches
2186   /// to be bypassed as all memory operations by the same thread are
2187   /// sequentially consistent, and no other thread can access scratch
2188   /// memory.
2189
2190   /// Other address spaces do not have a cache.
2191
2192   return Changed;
2193 }
2194
2195 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2196     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2197     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2198
2199   // Only handle load and store, not atomic read-modify-write instructions. The
2200   // latter use glc to indicate if the atomic returns a result and so must not
2201   // be used for cache control.
2202   assert(MI->mayLoad() ^ MI->mayStore());
2203
2204   // Only update load and store, not LLVM IR atomic read-modify-write
2205   // instructions. The latter are always marked as volatile, so they cannot
2206   // sensibly be handled here, as we do not want to pessimize all atomics. They
2207   // also do not support the nontemporal attribute.
2208   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2209
2210   bool Changed = false;
2211
2212   if (IsVolatile) {
2213     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2214     // and MISS_LRU for store instructions.
2215     // Note: there is no L2 cache coherent bypass control at the ISA level.
2216     if (Op == SIMemOp::LOAD)
2217       Changed |= enableGLCBit(MI);
2218
2219     // Set MALL NOALLOC for load and store instructions.
2220     Changed |= enableDLCBit(MI);
2221
2222     // Ensure operation has completed at system scope to cause all volatile
2223     // operations to be visible outside the program in a global order. Do not
2224     // request cross address space as only the global address space can be
2225     // observable outside the program, so no need to cause a waitcnt for LDS
2226     // address space operations.
2227     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2228                           Position::AFTER, AtomicOrdering::Unordered);
2229     return Changed;
2230   }
2231
2232   if (IsNonTemporal) {
2233     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2234     // and L2 cache policy to STREAM.
2235     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2236     // to MISS_EVICT and the L2 cache policy to STREAM.
2237     if (Op == SIMemOp::STORE)
2238       Changed |= enableGLCBit(MI);
2239     Changed |= enableSLCBit(MI);
2240
2241     // Set MALL NOALLOC for load and store instructions.
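    // In total (illustrative sketch), a nontemporal access on gfx11 thus
    // carries slc and dlc, plus glc for stores, once the dlc bit is set below.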
2242 Changed |= enableDLCBit(MI); 2243 return Changed; 2244 } 2245 2246 return Changed; 2247 } 2248 2249 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, 2250 AMDGPU::CPol::CPol Value) const { 2251 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2252 if (!CPol) 2253 return false; 2254 2255 uint64_t NewTH = Value & AMDGPU::CPol::TH; 2256 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { 2257 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); 2258 return true; 2259 } 2260 2261 return false; 2262 } 2263 2264 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, 2265 AMDGPU::CPol::CPol Value) const { 2266 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2267 if (!CPol) 2268 return false; 2269 2270 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; 2271 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { 2272 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); 2273 return true; 2274 } 2275 2276 return false; 2277 } 2278 2279 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( 2280 const MachineBasicBlock::iterator MI) const { 2281 // TODO: implement flag for frontend to give us a hint not to insert waits. 2282 2283 MachineBasicBlock &MBB = *MI->getParent(); 2284 const DebugLoc &DL = MI->getDebugLoc(); 2285 2286 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); 2287 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); 2288 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); 2289 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); 2290 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); 2291 2292 return true; 2293 } 2294 2295 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, 2296 SIAtomicScope Scope, 2297 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2298 bool IsCrossAddrSpaceOrdering, 2299 Position Pos, AtomicOrdering Order) const { 2300 bool Changed = false; 2301 2302 MachineBasicBlock &MBB = *MI->getParent(); 2303 DebugLoc DL = MI->getDebugLoc(); 2304 2305 bool LOADCnt = false; 2306 bool DSCnt = false; 2307 bool STORECnt = false; 2308 2309 if (Pos == Position::AFTER) 2310 ++MI; 2311 2312 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 2313 SIAtomicAddrSpace::NONE) { 2314 switch (Scope) { 2315 case SIAtomicScope::SYSTEM: 2316 case SIAtomicScope::AGENT: 2317 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2318 LOADCnt |= true; 2319 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2320 STORECnt |= true; 2321 break; 2322 case SIAtomicScope::WORKGROUP: 2323 // In WGP mode the waves of a work-group can be executing on either CU of 2324 // the WGP. Therefore need to wait for operations to complete to ensure 2325 // they are visible to waves in the other CU as the L0 is per CU. 2326 // Otherwise in CU mode and all waves of a work-group are on the same CU 2327 // which shares the same L0. 2328 if (!ST.isCuModeEnabled()) { 2329 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2330 LOADCnt |= true; 2331 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2332 STORECnt |= true; 2333 } 2334 break; 2335 case SIAtomicScope::WAVEFRONT: 2336 case SIAtomicScope::SINGLETHREAD: 2337 // The L0 cache keeps all memory operations in order for 2338 // work-items in the same wavefront. 
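      // Rough summary of the gfx12 wait emission below (a sketch): loads are
      // covered by S_WAIT_LOADCNT (plus S_WAIT_SAMPLECNT/S_WAIT_BVHCNT except
      // in acquire sequences), stores by S_WAIT_STORECNT, and LDS ordering by
      // S_WAIT_DSCNT.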
2339 break; 2340 default: 2341 llvm_unreachable("Unsupported synchronization scope"); 2342 } 2343 } 2344 2345 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 2346 switch (Scope) { 2347 case SIAtomicScope::SYSTEM: 2348 case SIAtomicScope::AGENT: 2349 case SIAtomicScope::WORKGROUP: 2350 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2351 // not needed as LDS operations for all waves are executed in a total 2352 // global ordering as observed by all waves. Required if also 2353 // synchronizing with global/GDS memory as LDS operations could be 2354 // reordered with respect to later global/GDS memory operations of the 2355 // same wave. 2356 DSCnt |= IsCrossAddrSpaceOrdering; 2357 break; 2358 case SIAtomicScope::WAVEFRONT: 2359 case SIAtomicScope::SINGLETHREAD: 2360 // The LDS keeps all memory operations in order for 2361 // the same wavefront. 2362 break; 2363 default: 2364 llvm_unreachable("Unsupported synchronization scope"); 2365 } 2366 } 2367 2368 if (LOADCnt) { 2369 // Acquire sequences only need to wait on the previous atomic operation. 2370 // e.g. a typical sequence looks like 2371 // atomic load 2372 // (wait) 2373 // global_inv 2374 // 2375 // We do not have BVH or SAMPLE atomics, so the atomic load is always going 2376 // to be tracked using loadcnt. 2377 // 2378 // This also applies to fences. Fences cannot pair with an instruction 2379 // tracked with bvh/samplecnt as we don't have any atomics that do that. 2380 if (Order != AtomicOrdering::Acquire) { 2381 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); 2382 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); 2383 } 2384 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); 2385 Changed = true; 2386 } 2387 2388 if (STORECnt) { 2389 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); 2390 Changed = true; 2391 } 2392 2393 if (DSCnt) { 2394 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); 2395 Changed = true; 2396 } 2397 2398 if (Pos == Position::AFTER) 2399 --MI; 2400 2401 return Changed; 2402 } 2403 2404 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2405 SIAtomicScope Scope, 2406 SIAtomicAddrSpace AddrSpace, 2407 Position Pos) const { 2408 if (!InsertCacheInv) 2409 return false; 2410 2411 MachineBasicBlock &MBB = *MI->getParent(); 2412 DebugLoc DL = MI->getDebugLoc(); 2413 2414 /// The scratch address space does not need the global memory cache 2415 /// to be flushed as all memory operations by the same thread are 2416 /// sequentially consistent, and no other thread can access scratch 2417 /// memory. 2418 2419 /// Other address spaces do not have a cache. 2420 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2421 return false; 2422 2423 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2424 switch (Scope) { 2425 case SIAtomicScope::SYSTEM: 2426 ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2427 break; 2428 case SIAtomicScope::AGENT: 2429 ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2430 break; 2431 case SIAtomicScope::WORKGROUP: 2432 // In WGP mode the waves of a work-group can be executing on either CU of 2433 // the WGP. Therefore we need to invalidate the L0 which is per CU. 2434 // Otherwise in CU mode all waves of a work-group are on the same CU, and so 2435 // the L0 does not need to be invalidated. 
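    // For example (a sketch): an agent-scope acquire ends up emitting
    // GLOBAL_INV with SCOPE_DEV, while a workgroup-scope acquire emits
    // GLOBAL_INV with SCOPE_SE only when not in CU mode.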
2436 if (ST.isCuModeEnabled()) 2437 return false; 2438 2439 ScopeImm = AMDGPU::CPol::SCOPE_SE; 2440 break; 2441 case SIAtomicScope::WAVEFRONT: 2442 case SIAtomicScope::SINGLETHREAD: 2443 // No cache to invalidate. 2444 return false; 2445 default: 2446 llvm_unreachable("Unsupported synchronization scope"); 2447 } 2448 2449 if (Pos == Position::AFTER) 2450 ++MI; 2451 2452 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); 2453 2454 if (Pos == Position::AFTER) 2455 --MI; 2456 2457 return true; 2458 } 2459 2460 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 2461 SIAtomicScope Scope, 2462 SIAtomicAddrSpace AddrSpace, 2463 bool IsCrossAddrSpaceOrdering, 2464 Position Pos) const { 2465 MachineBasicBlock &MBB = *MI->getParent(); 2466 DebugLoc DL = MI->getDebugLoc(); 2467 2468 // The scratch address space does not need the global memory cache 2469 // writeback as all memory operations by the same thread are 2470 // sequentially consistent, and no other thread can access scratch 2471 // memory. 2472 2473 // Other address spaces do not have a cache. 2474 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2475 return false; 2476 2477 if (Pos == Position::AFTER) 2478 ++MI; 2479 2480 // global_wb is only necessary at system scope for gfx120x targets. 2481 // 2482 // Emitting it for lower scopes is a slow no-op, so we omit it 2483 // for performance. 2484 switch (Scope) { 2485 case SIAtomicScope::SYSTEM: 2486 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) 2487 .addImm(AMDGPU::CPol::SCOPE_SYS); 2488 break; 2489 case SIAtomicScope::AGENT: 2490 case SIAtomicScope::WORKGROUP: 2491 // No WB necessary, but we still have to wait. 2492 break; 2493 case SIAtomicScope::WAVEFRONT: 2494 case SIAtomicScope::SINGLETHREAD: 2495 // No WB or wait necessary here. 2496 return false; 2497 default: 2498 llvm_unreachable("Unsupported synchronization scope"); 2499 } 2500 2501 if (Pos == Position::AFTER) 2502 --MI; 2503 2504 // We always have to wait for previous memory operations (load/store) to 2505 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), 2506 // we of course need to wait for that as well. 2507 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 2508 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); 2509 2510 return true; 2511 } 2512 2513 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( 2514 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2515 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 2516 2517 // Only handle load and store, not atomic read-modify-write instructions. 2518 assert(MI->mayLoad() ^ MI->mayStore()); 2519 2520 // Only update load and store, not LLVM IR atomic read-modify-write 2521 // instructions. The latter are always marked as volatile so cannot sensibly 2522 // handle it as do not want to pessimize all atomics. Also they do not support 2523 // the nontemporal attribute. 2524 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 2525 2526 bool Changed = false; 2527 2528 if (IsLastUse) { 2529 // Set last-use hint. 2530 Changed |= setTH(MI, AMDGPU::CPol::TH_LU); 2531 } else if (IsNonTemporal) { 2532 // Set non-temporal hint for all cache levels. 
2533     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2534   }
2535
2536   if (IsVolatile) {
2537     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2538
2539     if (Op == SIMemOp::STORE)
2540       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2541
2542     // Ensure operation has completed at system scope to cause all volatile
2543     // operations to be visible outside the program in a global order. Do not
2544     // request cross address space as only the global address space can be
2545     // observable outside the program, so no need to cause a waitcnt for LDS
2546     // address space operations.
2547     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2548                           Position::AFTER, AtomicOrdering::Unordered);
2549   }
2550
2551   return Changed;
2552 }
2553
2554 bool SIGfx12CacheControl::expandSystemScopeStore(
2555     MachineBasicBlock::iterator &MI) const {
2556   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2557   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2558     return insertWaitsBeforeSystemScopeStore(MI);
2559
2560   return false;
2561 }
2562
2563 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2564                                          SIAtomicScope Scope,
2565                                          SIAtomicAddrSpace AddrSpace) const {
2566   bool Changed = false;
2567
2568   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2569     switch (Scope) {
2570     case SIAtomicScope::SYSTEM:
2571       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2572       break;
2573     case SIAtomicScope::AGENT:
2574       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2575       break;
2576     case SIAtomicScope::WORKGROUP:
2577       // In workgroup mode, SCOPE_SE is needed as waves can execute on
2578       // different CUs that access different L0s.
2579       if (!ST.isCuModeEnabled())
2580         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2581       break;
2582     case SIAtomicScope::WAVEFRONT:
2583     case SIAtomicScope::SINGLETHREAD:
2584       // No cache to bypass.
2585       break;
2586     default:
2587       llvm_unreachable("Unsupported synchronization scope");
2588     }
2589   }
2590
2591   // The scratch address space does not need the global memory caches
2592   // to be bypassed as all memory operations by the same thread are
2593   // sequentially consistent, and no other thread can access scratch
2594   // memory.
2595
2596   // Other address spaces do not have a cache.
2597 2598 return Changed; 2599 } 2600 2601 bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 2602 if (AtomicPseudoMIs.empty()) 2603 return false; 2604 2605 for (auto &MI : AtomicPseudoMIs) 2606 MI->eraseFromParent(); 2607 2608 AtomicPseudoMIs.clear(); 2609 return true; 2610 } 2611 2612 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 2613 MachineBasicBlock::iterator &MI) { 2614 assert(MI->mayLoad() && !MI->mayStore()); 2615 2616 bool Changed = false; 2617 2618 if (MOI.isAtomic()) { 2619 const AtomicOrdering Order = MOI.getOrdering(); 2620 if (Order == AtomicOrdering::Monotonic || 2621 Order == AtomicOrdering::Acquire || 2622 Order == AtomicOrdering::SequentiallyConsistent) { 2623 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 2624 MOI.getOrderingAddrSpace()); 2625 } 2626 2627 if (Order == AtomicOrdering::SequentiallyConsistent) 2628 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), 2629 SIMemOp::LOAD | SIMemOp::STORE, 2630 MOI.getIsCrossAddressSpaceOrdering(), 2631 Position::BEFORE, Order); 2632 2633 if (Order == AtomicOrdering::Acquire || 2634 Order == AtomicOrdering::SequentiallyConsistent) { 2635 Changed |= CC->insertWait( 2636 MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, 2637 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); 2638 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2639 MOI.getOrderingAddrSpace(), 2640 Position::AFTER); 2641 } 2642 2643 return Changed; 2644 } 2645 2646 // Atomic instructions already bypass caches to the scope specified by the 2647 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use 2648 // instructions need additional treatment. 2649 Changed |= CC->enableVolatileAndOrNonTemporal( 2650 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), 2651 MOI.isNonTemporal(), MOI.isLastUse()); 2652 2653 return Changed; 2654 } 2655 2656 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, 2657 MachineBasicBlock::iterator &MI) { 2658 assert(!MI->mayLoad() && MI->mayStore()); 2659 2660 bool Changed = false; 2661 2662 if (MOI.isAtomic()) { 2663 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2664 MOI.getOrdering() == AtomicOrdering::Release || 2665 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2666 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), 2667 MOI.getOrderingAddrSpace()); 2668 } 2669 2670 if (MOI.getOrdering() == AtomicOrdering::Release || 2671 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2672 Changed |= CC->insertRelease(MI, MOI.getScope(), 2673 MOI.getOrderingAddrSpace(), 2674 MOI.getIsCrossAddressSpaceOrdering(), 2675 Position::BEFORE); 2676 2677 return Changed; 2678 } 2679 2680 // Atomic instructions already bypass caches to the scope specified by the 2681 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2682 // need additional treatment. 2683 Changed |= CC->enableVolatileAndOrNonTemporal( 2684 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), 2685 MOI.isNonTemporal()); 2686 2687 // GFX12 specific, scope(desired coherence domain in cache hierarchy) is 2688 // instruction field, do not confuse it with atomic scope. 
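  // For gfx12 this may, for example, prepend the soft waits emitted by
  // insertWaitsBeforeSystemScopeStore (loadcnt, samplecnt, bvhcnt, kmcnt and
  // storecnt all zero) when the store's cpol scope is SCOPE_SYS; other cache
  // controls are expected to treat this as a no-op (illustrative summary of
  // the implementation above).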
2689   Changed |= CC->expandSystemScopeStore(MI);
2690   return Changed;
2691 }
2692
2693 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2694                                           MachineBasicBlock::iterator &MI) {
2695   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2696
2697   AtomicPseudoMIs.push_back(MI);
2698   bool Changed = false;
2699
2700   // Refine fenced address space based on MMRAs.
2701   //
2702   // TODO: Should we support this MMRA on other atomic operations?
2703   auto OrderingAddrSpace =
2704       getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2705
2706   if (MOI.isAtomic()) {
2707     const AtomicOrdering Order = MOI.getOrdering();
2708     if (Order == AtomicOrdering::Acquire) {
2709       Changed |= CC->insertWait(
2710           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2711           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2712     }
2713
2714     if (Order == AtomicOrdering::Release ||
2715         Order == AtomicOrdering::AcquireRelease ||
2716         Order == AtomicOrdering::SequentiallyConsistent)
2717       /// TODO: This relies on a barrier always generating a waitcnt
2718       /// for LDS to ensure it is not reordered with the completion of
2719       /// the preceding LDS operations. If barrier had a memory
2720       /// ordering and memory scope, then the library does not need to
2721       /// generate a fence. Could add support in this file for
2722       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2723       /// adding S_WAITCNT before a S_BARRIER.
2724       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2725                                    MOI.getIsCrossAddressSpaceOrdering(),
2726                                    Position::BEFORE);
2727
2728     // TODO: If both release and invalidate are happening they could be combined
2729     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2730     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2731     // track cache invalidate and write back instructions.
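    // For example (a sketch, target- and scope-dependent): a seq_cst fence at
    // agent scope is expanded around this point into an insertRelease (waits,
    // possibly a writeback) followed by an insertAcquire (cache invalidation).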
2732 2733 if (Order == AtomicOrdering::Acquire || 2734 Order == AtomicOrdering::AcquireRelease || 2735 Order == AtomicOrdering::SequentiallyConsistent) 2736 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, 2737 Position::BEFORE); 2738 2739 return Changed; 2740 } 2741 2742 return Changed; 2743 } 2744 2745 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 2746 MachineBasicBlock::iterator &MI) { 2747 assert(MI->mayLoad() && MI->mayStore()); 2748 2749 bool Changed = false; 2750 2751 if (MOI.isAtomic()) { 2752 const AtomicOrdering Order = MOI.getOrdering(); 2753 if (Order == AtomicOrdering::Monotonic || 2754 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release || 2755 Order == AtomicOrdering::AcquireRelease || 2756 Order == AtomicOrdering::SequentiallyConsistent) { 2757 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2758 MOI.getInstrAddrSpace()); 2759 } 2760 2761 if (Order == AtomicOrdering::Release || 2762 Order == AtomicOrdering::AcquireRelease || 2763 Order == AtomicOrdering::SequentiallyConsistent || 2764 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2765 Changed |= CC->insertRelease(MI, MOI.getScope(), 2766 MOI.getOrderingAddrSpace(), 2767 MOI.getIsCrossAddressSpaceOrdering(), 2768 Position::BEFORE); 2769 2770 if (Order == AtomicOrdering::Acquire || 2771 Order == AtomicOrdering::AcquireRelease || 2772 Order == AtomicOrdering::SequentiallyConsistent || 2773 MOI.getFailureOrdering() == AtomicOrdering::Acquire || 2774 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 2775 Changed |= CC->insertWait( 2776 MI, MOI.getScope(), MOI.getInstrAddrSpace(), 2777 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, 2778 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); 2779 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2780 MOI.getOrderingAddrSpace(), 2781 Position::AFTER); 2782 } 2783 2784 return Changed; 2785 } 2786 2787 return Changed; 2788 } 2789 2790 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 2791 bool Changed = false; 2792 2793 const MachineModuleInfo &MMI = 2794 getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); 2795 2796 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>()); 2797 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 2798 2799 for (auto &MBB : MF) { 2800 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 2801 2802 // Unbundle instructions after the post-RA scheduler. 
2803 if (MI->isBundle() && MI->mayLoadOrStore()) { 2804 MachineBasicBlock::instr_iterator II(MI->getIterator()); 2805 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 2806 I != E && I->isBundledWithPred(); ++I) { 2807 I->unbundleFromPred(); 2808 for (MachineOperand &MO : I->operands()) 2809 if (MO.isReg()) 2810 MO.setIsInternalRead(false); 2811 } 2812 2813 MI->eraseFromParent(); 2814 MI = II->getIterator(); 2815 } 2816 2817 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 2818 continue; 2819 2820 if (const auto &MOI = MOA.getLoadInfo(MI)) 2821 Changed |= expandLoad(*MOI, MI); 2822 else if (const auto &MOI = MOA.getStoreInfo(MI)) { 2823 Changed |= expandStore(*MOI, MI); 2824 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); 2825 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2826 Changed |= expandAtomicFence(*MOI, MI); 2827 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2828 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); 2829 } 2830 } 2831 2832 Changed |= removeAtomicPseudoMIs(); 2833 return Changed; 2834 } 2835 2836 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 2837 2838 char SIMemoryLegalizer::ID = 0; 2839 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 2840 2841 FunctionPass *llvm::createSIMemoryLegalizerPass() { 2842 return new SIMemoryLegalizer(); 2843 } 2844