1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/BitmaskEnum.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/IR/DiagnosticInfo.h" 24 #include "llvm/Support/AtomicOrdering.h" 25 #include "llvm/TargetParser/TargetParser.h" 26 27 using namespace llvm; 28 using namespace llvm::AMDGPU; 29 30 #define DEBUG_TYPE "si-memory-legalizer" 31 #define PASS_NAME "SI Memory Legalizer" 32 33 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 35 cl::desc("Use this to skip inserting cache invalidating instructions.")); 36 37 namespace { 38 39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 40 41 /// Memory operation flags. Can be ORed together. 42 enum class SIMemOp { 43 NONE = 0u, 44 LOAD = 1u << 0, 45 STORE = 1u << 1, 46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 47 }; 48 49 /// Position to insert a new instruction relative to an existing 50 /// instruction. 51 enum class Position { 52 BEFORE, 53 AFTER 54 }; 55 56 /// The atomic synchronization scopes supported by the AMDGPU target. 57 enum class SIAtomicScope { 58 NONE, 59 SINGLETHREAD, 60 WAVEFRONT, 61 WORKGROUP, 62 AGENT, 63 SYSTEM 64 }; 65 66 /// The distinct address spaces supported by the AMDGPU target for 67 /// atomic memory operation. Can be ORed together. 68 enum class SIAtomicAddrSpace { 69 NONE = 0u, 70 GLOBAL = 1u << 0, 71 LDS = 1u << 1, 72 SCRATCH = 1u << 2, 73 GDS = 1u << 3, 74 OTHER = 1u << 4, 75 76 /// The address spaces that can be accessed by a FLAT instruction. 77 FLAT = GLOBAL | LDS | SCRATCH, 78 79 /// The address spaces that support atomic instructions. 80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 81 82 /// All address spaces. 
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, 84 85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 86 }; 87 88 class SIMemOpInfo final { 89 private: 90 91 friend class SIMemOpAccess; 92 93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 95 SIAtomicScope Scope = SIAtomicScope::SYSTEM; 96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 98 bool IsCrossAddressSpaceOrdering = false; 99 bool IsVolatile = false; 100 bool IsNonTemporal = false; 101 102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, 103 SIAtomicScope Scope = SIAtomicScope::SYSTEM, 104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, 105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, 106 bool IsCrossAddressSpaceOrdering = true, 107 AtomicOrdering FailureOrdering = 108 AtomicOrdering::SequentiallyConsistent, 109 bool IsVolatile = false, 110 bool IsNonTemporal = false) 111 : Ordering(Ordering), FailureOrdering(FailureOrdering), 112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), 113 InstrAddrSpace(InstrAddrSpace), 114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), 115 IsVolatile(IsVolatile), 116 IsNonTemporal(IsNonTemporal) { 117 118 if (Ordering == AtomicOrdering::NotAtomic) { 119 assert(Scope == SIAtomicScope::NONE && 120 OrderingAddrSpace == SIAtomicAddrSpace::NONE && 121 !IsCrossAddressSpaceOrdering && 122 FailureOrdering == AtomicOrdering::NotAtomic); 123 return; 124 } 125 126 assert(Scope != SIAtomicScope::NONE && 127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != 128 SIAtomicAddrSpace::NONE && 129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != 130 SIAtomicAddrSpace::NONE); 131 132 // There is also no cross address space ordering if the ordering 133 // address space is the same as the instruction address space and 134 // only contains a single address space. 135 if ((OrderingAddrSpace == InstrAddrSpace) && 136 isPowerOf2_32(uint32_t(InstrAddrSpace))) 137 this->IsCrossAddressSpaceOrdering = false; 138 139 // Limit the scope to the maximum supported by the instruction's address 140 // spaces. 141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == 142 SIAtomicAddrSpace::NONE) { 143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); 144 } else if ((InstrAddrSpace & 145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == 146 SIAtomicAddrSpace::NONE) { 147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); 148 } else if ((InstrAddrSpace & 149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | 150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { 151 this->Scope = std::min(Scope, SIAtomicScope::AGENT); 152 } 153 } 154 155 public: 156 /// \returns Atomic synchronization scope of the machine instruction used to 157 /// create this SIMemOpInfo. 158 SIAtomicScope getScope() const { 159 return Scope; 160 } 161 162 /// \returns Ordering constraint of the machine instruction used to 163 /// create this SIMemOpInfo. 164 AtomicOrdering getOrdering() const { 165 return Ordering; 166 } 167 168 /// \returns Failure ordering constraint of the machine instruction used to 169 /// create this SIMemOpInfo. 170 AtomicOrdering getFailureOrdering() const { 171 return FailureOrdering; 172 } 173 174 /// \returns The address spaces be accessed by the machine 175 /// instruction used to create this SIMemOpInfo. 
176 SIAtomicAddrSpace getInstrAddrSpace() const { 177 return InstrAddrSpace; 178 } 179 180 /// \returns The address spaces that must be ordered by the machine 181 /// instruction used to create this SIMemOpInfo. 182 SIAtomicAddrSpace getOrderingAddrSpace() const { 183 return OrderingAddrSpace; 184 } 185 186 /// \returns Return true iff memory ordering of operations on 187 /// different address spaces is required. 188 bool getIsCrossAddressSpaceOrdering() const { 189 return IsCrossAddressSpaceOrdering; 190 } 191 192 /// \returns True if memory access of the machine instruction used to 193 /// create this SIMemOpInfo is volatile, false otherwise. 194 bool isVolatile() const { 195 return IsVolatile; 196 } 197 198 /// \returns True if memory access of the machine instruction used to 199 /// create this SIMemOpInfo is nontemporal, false otherwise. 200 bool isNonTemporal() const { 201 return IsNonTemporal; 202 } 203 204 /// \returns True if ordering constraint of the machine instruction used to 205 /// create this SIMemOpInfo is unordered or higher, false otherwise. 206 bool isAtomic() const { 207 return Ordering != AtomicOrdering::NotAtomic; 208 } 209 210 }; 211 212 class SIMemOpAccess final { 213 private: 214 AMDGPUMachineModuleInfo *MMI = nullptr; 215 216 /// Reports unsupported message \p Msg for \p MI to LLVM context. 217 void reportUnsupported(const MachineBasicBlock::iterator &MI, 218 const char *Msg) const; 219 220 /// Inspects the target synchronization scope \p SSID and determines 221 /// the SI atomic scope it corresponds to, the address spaces it 222 /// covers, and whether the memory ordering applies between address 223 /// spaces. 224 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; 226 227 /// \return Return a bit set of the address spaces accessed by \p AS. 228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; 229 230 /// \returns Info constructed from \p MI, which has at least machine memory 231 /// operand. 232 std::optional<SIMemOpInfo> 233 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const; 234 235 public: 236 /// Construct class to support accessing the machine memory operands 237 /// of instructions in the machine function \p MF. 238 SIMemOpAccess(MachineFunction &MF); 239 240 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. 241 std::optional<SIMemOpInfo> 242 getLoadInfo(const MachineBasicBlock::iterator &MI) const; 243 244 /// \returns Store info if \p MI is a store operation, "std::nullopt" 245 /// otherwise. 246 std::optional<SIMemOpInfo> 247 getStoreInfo(const MachineBasicBlock::iterator &MI) const; 248 249 /// \returns Atomic fence info if \p MI is an atomic fence operation, 250 /// "std::nullopt" otherwise. 251 std::optional<SIMemOpInfo> 252 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const; 253 254 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or 255 /// rmw operation, "std::nullopt" otherwise. 256 std::optional<SIMemOpInfo> 257 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; 258 }; 259 260 class SICacheControl { 261 protected: 262 263 /// AMDGPU subtarget info. 264 const GCNSubtarget &ST; 265 266 /// Instruction info. 267 const SIInstrInfo *TII = nullptr; 268 269 IsaVersion IV; 270 271 /// Whether to insert cache invalidating instructions. 
272 bool InsertCacheInv; 273 274 SICacheControl(const GCNSubtarget &ST); 275 276 /// Sets named bit \p BitName to "true" if present in instruction \p MI. 277 /// \returns Returns true if \p MI is modified, false otherwise. 278 bool enableNamedBit(const MachineBasicBlock::iterator MI, 279 AMDGPU::CPol::CPol Bit) const; 280 281 public: 282 283 /// Create a cache control for the subtarget \p ST. 284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 285 286 /// Update \p MI memory load instruction to bypass any caches up to 287 /// the \p Scope memory scope for address spaces \p 288 /// AddrSpace. Return true iff the instruction was modified. 289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 290 SIAtomicScope Scope, 291 SIAtomicAddrSpace AddrSpace) const = 0; 292 293 /// Update \p MI memory store instruction to bypass any caches up to 294 /// the \p Scope memory scope for address spaces \p 295 /// AddrSpace. Return true iff the instruction was modified. 296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 297 SIAtomicScope Scope, 298 SIAtomicAddrSpace AddrSpace) const = 0; 299 300 /// Update \p MI memory read-modify-write instruction to bypass any caches up 301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true 302 /// iff the instruction was modified. 303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 304 SIAtomicScope Scope, 305 SIAtomicAddrSpace AddrSpace) const = 0; 306 307 /// Update \p MI memory instruction of kind \p Op associated with address 308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return 309 /// true iff the instruction was modified. 310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 311 SIAtomicAddrSpace AddrSpace, 312 SIMemOp Op, bool IsVolatile, 313 bool IsNonTemporal) const = 0; 314 315 /// Inserts any necessary instructions at position \p Pos relative 316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 317 /// \p Op associated with address spaces \p AddrSpace have completed. Used 318 /// between memory instructions to enforce the order they become visible as 319 /// observed by other memory instructions executing in memory scope \p Scope. 320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 321 /// address spaces. Returns true iff any instructions inserted. 322 virtual bool insertWait(MachineBasicBlock::iterator &MI, 323 SIAtomicScope Scope, 324 SIAtomicAddrSpace AddrSpace, 325 SIMemOp Op, 326 bool IsCrossAddrSpaceOrdering, 327 Position Pos) const = 0; 328 329 /// Inserts any necessary instructions at position \p Pos relative to 330 /// instruction \p MI to ensure any subsequent memory instructions of this 331 /// thread with address spaces \p AddrSpace will observe the previous memory 332 /// operations by any thread for memory scopes up to memory scope \p Scope . 333 /// Returns true iff any instructions inserted. 334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 335 SIAtomicScope Scope, 336 SIAtomicAddrSpace AddrSpace, 337 Position Pos) const = 0; 338 339 /// Inserts any necessary instructions at position \p Pos relative to 340 /// instruction \p MI to ensure previous memory instructions by this thread 341 /// with address spaces \p AddrSpace have completed and can be observed by 342 /// subsequent memory instructions by any thread executing in memory scope \p 343 /// Scope. 
\p IsCrossAddrSpaceOrdering indicates if the memory ordering is 344 /// between address spaces. Returns true iff any instructions inserted. 345 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 346 SIAtomicScope Scope, 347 SIAtomicAddrSpace AddrSpace, 348 bool IsCrossAddrSpaceOrdering, 349 Position Pos) const = 0; 350 351 /// Virtual destructor to allow derivations to be deleted. 352 virtual ~SICacheControl() = default; 353 354 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 355 MachineBasicBlock::iterator &MI) const { 356 return false; 357 } 358 }; 359 360 class SIGfx6CacheControl : public SICacheControl { 361 protected: 362 363 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 364 /// is modified, false otherwise. 365 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 366 return enableNamedBit(MI, AMDGPU::CPol::GLC); 367 } 368 369 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 370 /// is modified, false otherwise. 371 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 372 return enableNamedBit(MI, AMDGPU::CPol::SLC); 373 } 374 375 public: 376 377 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} 378 379 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 380 SIAtomicScope Scope, 381 SIAtomicAddrSpace AddrSpace) const override; 382 383 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 384 SIAtomicScope Scope, 385 SIAtomicAddrSpace AddrSpace) const override; 386 387 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 388 SIAtomicScope Scope, 389 SIAtomicAddrSpace AddrSpace) const override; 390 391 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 392 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 393 bool IsVolatile, 394 bool IsNonTemporal) const override; 395 396 bool insertWait(MachineBasicBlock::iterator &MI, 397 SIAtomicScope Scope, 398 SIAtomicAddrSpace AddrSpace, 399 SIMemOp Op, 400 bool IsCrossAddrSpaceOrdering, 401 Position Pos) const override; 402 403 bool insertAcquire(MachineBasicBlock::iterator &MI, 404 SIAtomicScope Scope, 405 SIAtomicAddrSpace AddrSpace, 406 Position Pos) const override; 407 408 bool insertRelease(MachineBasicBlock::iterator &MI, 409 SIAtomicScope Scope, 410 SIAtomicAddrSpace AddrSpace, 411 bool IsCrossAddrSpaceOrdering, 412 Position Pos) const override; 413 }; 414 415 class SIGfx7CacheControl : public SIGfx6CacheControl { 416 public: 417 418 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} 419 420 bool insertAcquire(MachineBasicBlock::iterator &MI, 421 SIAtomicScope Scope, 422 SIAtomicAddrSpace AddrSpace, 423 Position Pos) const override; 424 425 }; 426 427 class SIGfx90ACacheControl : public SIGfx7CacheControl { 428 public: 429 430 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 431 432 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 433 SIAtomicScope Scope, 434 SIAtomicAddrSpace AddrSpace) const override; 435 436 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 437 SIAtomicScope Scope, 438 SIAtomicAddrSpace AddrSpace) const override; 439 440 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 441 SIAtomicScope Scope, 442 SIAtomicAddrSpace AddrSpace) const override; 443 444 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 445 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 446 bool IsVolatile, 447 bool IsNonTemporal) const override; 448 449 bool insertWait(MachineBasicBlock::iterator &MI, 
450 SIAtomicScope Scope, 451 SIAtomicAddrSpace AddrSpace, 452 SIMemOp Op, 453 bool IsCrossAddrSpaceOrdering, 454 Position Pos) const override; 455 456 bool insertAcquire(MachineBasicBlock::iterator &MI, 457 SIAtomicScope Scope, 458 SIAtomicAddrSpace AddrSpace, 459 Position Pos) const override; 460 461 bool insertRelease(MachineBasicBlock::iterator &MI, 462 SIAtomicScope Scope, 463 SIAtomicAddrSpace AddrSpace, 464 bool IsCrossAddrSpaceOrdering, 465 Position Pos) const override; 466 }; 467 468 class SIGfx940CacheControl : public SIGfx90ACacheControl { 469 protected: 470 471 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI 472 /// is modified, false otherwise. 473 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { 474 return enableNamedBit(MI, AMDGPU::CPol::SC0); 475 } 476 477 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI 478 /// is modified, false otherwise. 479 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { 480 return enableNamedBit(MI, AMDGPU::CPol::SC1); 481 } 482 483 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI 484 /// is modified, false otherwise. 485 bool enableNTBit(const MachineBasicBlock::iterator &MI) const { 486 return enableNamedBit(MI, AMDGPU::CPol::NT); 487 } 488 489 public: 490 491 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; 492 493 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 494 SIAtomicScope Scope, 495 SIAtomicAddrSpace AddrSpace) const override; 496 497 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 498 SIAtomicScope Scope, 499 SIAtomicAddrSpace AddrSpace) const override; 500 501 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 502 SIAtomicScope Scope, 503 SIAtomicAddrSpace AddrSpace) const override; 504 505 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 506 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 507 bool IsVolatile, 508 bool IsNonTemporal) const override; 509 510 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 511 SIAtomicAddrSpace AddrSpace, Position Pos) const override; 512 513 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 514 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, 515 Position Pos) const override; 516 517 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 518 MachineBasicBlock::iterator &MI) const override { 519 bool Changed = false; 520 if (ST.hasForceStoreSC0SC1() && 521 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | 522 SIAtomicAddrSpace::GLOBAL | 523 SIAtomicAddrSpace::OTHER)) != 524 SIAtomicAddrSpace::NONE) { 525 Changed |= enableSC0Bit(MI); 526 Changed |= enableSC1Bit(MI); 527 } 528 return Changed; 529 } 530 }; 531 532 class SIGfx10CacheControl : public SIGfx7CacheControl { 533 protected: 534 535 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 536 /// is modified, false otherwise. 
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
614 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 615 MachineBasicBlock::iterator &MI); 616 617 public: 618 static char ID; 619 620 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 621 622 void getAnalysisUsage(AnalysisUsage &AU) const override { 623 AU.setPreservesCFG(); 624 MachineFunctionPass::getAnalysisUsage(AU); 625 } 626 627 StringRef getPassName() const override { 628 return PASS_NAME; 629 } 630 631 bool runOnMachineFunction(MachineFunction &MF) override; 632 }; 633 634 } // end namespace anonymous 635 636 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 637 const char *Msg) const { 638 const Function &Func = MI->getParent()->getParent()->getFunction(); 639 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 640 Func.getContext().diagnose(Diag); 641 } 642 643 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 644 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 645 SIAtomicAddrSpace InstrAddrSpace) const { 646 if (SSID == SyncScope::System) 647 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); 648 if (SSID == MMI->getAgentSSID()) 649 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); 650 if (SSID == MMI->getWorkgroupSSID()) 651 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, 652 true); 653 if (SSID == MMI->getWavefrontSSID()) 654 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, 655 true); 656 if (SSID == SyncScope::SingleThread) 657 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, 658 true); 659 if (SSID == MMI->getSystemOneAddressSpaceSSID()) 660 return std::tuple(SIAtomicScope::SYSTEM, 661 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 662 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 663 return std::tuple(SIAtomicScope::AGENT, 664 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 665 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 666 return std::tuple(SIAtomicScope::WORKGROUP, 667 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 668 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 669 return std::tuple(SIAtomicScope::WAVEFRONT, 670 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 671 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 672 return std::tuple(SIAtomicScope::SINGLETHREAD, 673 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 674 return std::nullopt; 675 } 676 677 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 678 if (AS == AMDGPUAS::FLAT_ADDRESS) 679 return SIAtomicAddrSpace::FLAT; 680 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 681 return SIAtomicAddrSpace::GLOBAL; 682 if (AS == AMDGPUAS::LOCAL_ADDRESS) 683 return SIAtomicAddrSpace::LDS; 684 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 685 return SIAtomicAddrSpace::SCRATCH; 686 if (AS == AMDGPUAS::REGION_ADDRESS) 687 return SIAtomicAddrSpace::GDS; 688 689 return SIAtomicAddrSpace::OTHER; 690 } 691 692 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 693 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 694 } 695 696 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 697 const MachineBasicBlock::iterator &MI) const { 698 assert(MI->getNumMemOperands() > 0); 699 700 SyncScope::ID SSID = SyncScope::SingleThread; 701 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 702 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 703 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 704 bool IsNonTemporal = true; 705 bool IsVolatile = false; 706 707 // Validator 
should check whether or not MMOs cover the entire set of 708 // locations accessed by the memory instruction. 709 for (const auto &MMO : MI->memoperands()) { 710 IsNonTemporal &= MMO->isNonTemporal(); 711 IsVolatile |= MMO->isVolatile(); 712 InstrAddrSpace |= 713 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 714 AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); 715 if (OpOrdering != AtomicOrdering::NotAtomic) { 716 const auto &IsSyncScopeInclusion = 717 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 718 if (!IsSyncScopeInclusion) { 719 reportUnsupported(MI, 720 "Unsupported non-inclusive atomic synchronization scope"); 721 return std::nullopt; 722 } 723 724 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); 725 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); 726 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 727 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 728 FailureOrdering = 729 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); 730 } 731 } 732 733 SIAtomicScope Scope = SIAtomicScope::NONE; 734 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 735 bool IsCrossAddressSpaceOrdering = false; 736 if (Ordering != AtomicOrdering::NotAtomic) { 737 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 738 if (!ScopeOrNone) { 739 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 740 return std::nullopt; 741 } 742 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 743 *ScopeOrNone; 744 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 745 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 746 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 747 reportUnsupported(MI, "Unsupported atomic address space"); 748 return std::nullopt; 749 } 750 } 751 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 752 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 753 IsNonTemporal); 754 } 755 756 std::optional<SIMemOpInfo> 757 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { 758 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 759 760 if (!(MI->mayLoad() && !MI->mayStore())) 761 return std::nullopt; 762 763 // Be conservative if there are no memory operands. 764 if (MI->getNumMemOperands() == 0) 765 return SIMemOpInfo(); 766 767 return constructFromMIWithMMO(MI); 768 } 769 770 std::optional<SIMemOpInfo> 771 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { 772 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 773 774 if (!(!MI->mayLoad() && MI->mayStore())) 775 return std::nullopt; 776 777 // Be conservative if there are no memory operands. 
778 if (MI->getNumMemOperands() == 0) 779 return SIMemOpInfo(); 780 781 return constructFromMIWithMMO(MI); 782 } 783 784 std::optional<SIMemOpInfo> 785 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { 786 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 787 788 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 789 return std::nullopt; 790 791 AtomicOrdering Ordering = 792 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 793 794 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 795 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 796 if (!ScopeOrNone) { 797 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 798 return std::nullopt; 799 } 800 801 SIAtomicScope Scope = SIAtomicScope::NONE; 802 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 803 bool IsCrossAddressSpaceOrdering = false; 804 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 805 *ScopeOrNone; 806 807 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 808 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 809 reportUnsupported(MI, "Unsupported atomic address space"); 810 return std::nullopt; 811 } 812 813 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 814 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); 815 } 816 817 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 818 const MachineBasicBlock::iterator &MI) const { 819 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 820 821 if (!(MI->mayLoad() && MI->mayStore())) 822 return std::nullopt; 823 824 // Be conservative if there are no memory operands. 825 if (MI->getNumMemOperands() == 0) 826 return SIMemOpInfo(); 827 828 return constructFromMIWithMMO(MI); 829 } 830 831 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 832 TII = ST.getInstrInfo(); 833 IV = getIsaVersion(ST.getCPU()); 834 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 835 } 836 837 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 838 AMDGPU::CPol::CPol Bit) const { 839 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 840 if (!CPol) 841 return false; 842 843 CPol->setImm(CPol->getImm() | Bit); 844 return true; 845 } 846 847 /* static */ 848 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 849 GCNSubtarget::Generation Generation = ST.getGeneration(); 850 if (ST.hasGFX940Insts()) 851 return std::make_unique<SIGfx940CacheControl>(ST); 852 if (ST.hasGFX90AInsts()) 853 return std::make_unique<SIGfx90ACacheControl>(ST); 854 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 855 return std::make_unique<SIGfx6CacheControl>(ST); 856 if (Generation < AMDGPUSubtarget::GFX10) 857 return std::make_unique<SIGfx7CacheControl>(ST); 858 if (Generation < AMDGPUSubtarget::GFX11) 859 return std::make_unique<SIGfx10CacheControl>(ST); 860 return std::make_unique<SIGfx11CacheControl>(ST); 861 } 862 863 bool SIGfx6CacheControl::enableLoadCacheBypass( 864 const MachineBasicBlock::iterator &MI, 865 SIAtomicScope Scope, 866 SIAtomicAddrSpace AddrSpace) const { 867 assert(MI->mayLoad() && !MI->mayStore()); 868 bool Changed = false; 869 870 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 871 switch (Scope) { 872 case SIAtomicScope::SYSTEM: 873 case SIAtomicScope::AGENT: 874 // Set L1 cache policy to MISS_EVICT. 875 // Note: there is no L2 cache bypass policy at the ISA level. 
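      // Setting GLC on the load is what requests the L1 bypass here; stores
      // and RMW atomics are handled by the separate hooks below.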
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result and so must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
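    // Both bits live in the same cpol operand, and enableNamedBit simply ORs
    // the requested bit into the existing immediate, so the two updates below
    // are independent of one another.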
963 Changed |= enableGLCBit(MI); 964 Changed |= enableSLCBit(MI); 965 return Changed; 966 } 967 968 return Changed; 969 } 970 971 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 972 SIAtomicScope Scope, 973 SIAtomicAddrSpace AddrSpace, 974 SIMemOp Op, 975 bool IsCrossAddrSpaceOrdering, 976 Position Pos) const { 977 bool Changed = false; 978 979 MachineBasicBlock &MBB = *MI->getParent(); 980 DebugLoc DL = MI->getDebugLoc(); 981 982 if (Pos == Position::AFTER) 983 ++MI; 984 985 bool VMCnt = false; 986 bool LGKMCnt = false; 987 988 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 989 SIAtomicAddrSpace::NONE) { 990 switch (Scope) { 991 case SIAtomicScope::SYSTEM: 992 case SIAtomicScope::AGENT: 993 VMCnt |= true; 994 break; 995 case SIAtomicScope::WORKGROUP: 996 case SIAtomicScope::WAVEFRONT: 997 case SIAtomicScope::SINGLETHREAD: 998 // The L1 cache keeps all memory operations in order for 999 // wavefronts in the same work-group. 1000 break; 1001 default: 1002 llvm_unreachable("Unsupported synchronization scope"); 1003 } 1004 } 1005 1006 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1007 switch (Scope) { 1008 case SIAtomicScope::SYSTEM: 1009 case SIAtomicScope::AGENT: 1010 case SIAtomicScope::WORKGROUP: 1011 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1012 // not needed as LDS operations for all waves are executed in a total 1013 // global ordering as observed by all waves. Required if also 1014 // synchronizing with global/GDS memory as LDS operations could be 1015 // reordered with respect to later global/GDS memory operations of the 1016 // same wave. 1017 LGKMCnt |= IsCrossAddrSpaceOrdering; 1018 break; 1019 case SIAtomicScope::WAVEFRONT: 1020 case SIAtomicScope::SINGLETHREAD: 1021 // The LDS keeps all memory operations in order for 1022 // the same wavefront. 1023 break; 1024 default: 1025 llvm_unreachable("Unsupported synchronization scope"); 1026 } 1027 } 1028 1029 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1030 switch (Scope) { 1031 case SIAtomicScope::SYSTEM: 1032 case SIAtomicScope::AGENT: 1033 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1034 // is not needed as GDS operations for all waves are executed in a total 1035 // global ordering as observed by all waves. Required if also 1036 // synchronizing with global/LDS memory as GDS operations could be 1037 // reordered with respect to later global/LDS memory operations of the 1038 // same wave. 1039 LGKMCnt |= IsCrossAddrSpaceOrdering; 1040 break; 1041 case SIAtomicScope::WORKGROUP: 1042 case SIAtomicScope::WAVEFRONT: 1043 case SIAtomicScope::SINGLETHREAD: 1044 // The GDS keeps all memory operations in order for 1045 // the same work-group. 1046 break; 1047 default: 1048 llvm_unreachable("Unsupported synchronization scope"); 1049 } 1050 } 1051 1052 if (VMCnt || LGKMCnt) { 1053 unsigned WaitCntImmediate = 1054 AMDGPU::encodeWaitcnt(IV, 1055 VMCnt ? 0 : getVmcntBitMask(IV), 1056 getExpcntBitMask(IV), 1057 LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); 1058 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 1059 .addImm(WaitCntImmediate); 1060 Changed = true; 1061 } 1062 1063 if (Pos == Position::AFTER) 1064 --MI; 1065 1066 return Changed; 1067 } 1068 1069 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1070 SIAtomicScope Scope, 1071 SIAtomicAddrSpace AddrSpace, 1072 Position Pos) const { 1073 if (!InsertCacheInv) 1074 return false; 1075 1076 bool Changed = false; 1077 1078 MachineBasicBlock &MBB = *MI->getParent(); 1079 DebugLoc DL = MI->getDebugLoc(); 1080 1081 if (Pos == Position::AFTER) 1082 ++MI; 1083 1084 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1085 switch (Scope) { 1086 case SIAtomicScope::SYSTEM: 1087 case SIAtomicScope::AGENT: 1088 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 1089 Changed = true; 1090 break; 1091 case SIAtomicScope::WORKGROUP: 1092 case SIAtomicScope::WAVEFRONT: 1093 case SIAtomicScope::SINGLETHREAD: 1094 // No cache to invalidate. 1095 break; 1096 default: 1097 llvm_unreachable("Unsupported synchronization scope"); 1098 } 1099 } 1100 1101 /// The scratch address space does not need the global memory cache 1102 /// to be flushed as all memory operations by the same thread are 1103 /// sequentially consistent, and no other thread can access scratch 1104 /// memory. 1105 1106 /// Other address spaces do not have a cache. 1107 1108 if (Pos == Position::AFTER) 1109 --MI; 1110 1111 return Changed; 1112 } 1113 1114 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1115 SIAtomicScope Scope, 1116 SIAtomicAddrSpace AddrSpace, 1117 bool IsCrossAddrSpaceOrdering, 1118 Position Pos) const { 1119 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1120 IsCrossAddrSpaceOrdering, Pos); 1121 } 1122 1123 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1124 SIAtomicScope Scope, 1125 SIAtomicAddrSpace AddrSpace, 1126 Position Pos) const { 1127 if (!InsertCacheInv) 1128 return false; 1129 1130 bool Changed = false; 1131 1132 MachineBasicBlock &MBB = *MI->getParent(); 1133 DebugLoc DL = MI->getDebugLoc(); 1134 1135 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 1136 1137 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 1138 ? AMDGPU::BUFFER_WBINVL1 1139 : AMDGPU::BUFFER_WBINVL1_VOL; 1140 1141 if (Pos == Position::AFTER) 1142 ++MI; 1143 1144 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1145 switch (Scope) { 1146 case SIAtomicScope::SYSTEM: 1147 case SIAtomicScope::AGENT: 1148 BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 1149 Changed = true; 1150 break; 1151 case SIAtomicScope::WORKGROUP: 1152 case SIAtomicScope::WAVEFRONT: 1153 case SIAtomicScope::SINGLETHREAD: 1154 // No cache to invalidate. 1155 break; 1156 default: 1157 llvm_unreachable("Unsupported synchronization scope"); 1158 } 1159 } 1160 1161 /// The scratch address space does not need the global memory cache 1162 /// to be flushed as all memory operations by the same thread are 1163 /// sequentially consistent, and no other thread can access scratch 1164 /// memory. 1165 1166 /// Other address spaces do not have a cache. 
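  // The iterator was advanced earlier when Pos == Position::AFTER so that the
  // invalidate was emitted after MI; restore it here so the caller still sees
  // the original instruction.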

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      // RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result and so must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves
    // in the work-group access the same L1, nor wait for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
1346 AddrSpace &= ~SIAtomicAddrSpace::LDS; 1347 } 1348 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, 1349 IsCrossAddrSpaceOrdering, Pos); 1350 } 1351 1352 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1353 SIAtomicScope Scope, 1354 SIAtomicAddrSpace AddrSpace, 1355 Position Pos) const { 1356 if (!InsertCacheInv) 1357 return false; 1358 1359 bool Changed = false; 1360 1361 MachineBasicBlock &MBB = *MI->getParent(); 1362 DebugLoc DL = MI->getDebugLoc(); 1363 1364 if (Pos == Position::AFTER) 1365 ++MI; 1366 1367 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1368 switch (Scope) { 1369 case SIAtomicScope::SYSTEM: 1370 // Ensures that following loads will not see stale remote VMEM data or 1371 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1372 // CC will never be stale due to the local memory probes. 1373 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); 1374 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1375 // hardware does not reorder memory operations by the same wave with 1376 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to 1377 // remove any cache lines of earlier writes by the same wave and ensures 1378 // later reads by the same wave will refetch the cache lines. 1379 Changed = true; 1380 break; 1381 case SIAtomicScope::AGENT: 1382 // Same as GFX7. 1383 break; 1384 case SIAtomicScope::WORKGROUP: 1385 // In threadgroup split mode the waves of a work-group can be executing on 1386 // different CUs. Therefore need to invalidate the L1 which is per CU. 1387 // Otherwise in non-threadgroup split mode all waves of a work-group are 1388 // on the same CU, and so the L1 does not need to be invalidated. 1389 if (ST.isTgSplitEnabled()) { 1390 // Same as GFX7 using agent scope. 1391 Scope = SIAtomicScope::AGENT; 1392 } 1393 break; 1394 case SIAtomicScope::WAVEFRONT: 1395 case SIAtomicScope::SINGLETHREAD: 1396 // Same as GFX7. 1397 break; 1398 default: 1399 llvm_unreachable("Unsupported synchronization scope"); 1400 } 1401 } 1402 1403 /// The scratch address space does not need the global memory cache 1404 /// to be flushed as all memory operations by the same thread are 1405 /// sequentially consistent, and no other thread can access scratch 1406 /// memory. 1407 1408 /// Other address spaces do not have a cache. 1409 1410 if (Pos == Position::AFTER) 1411 --MI; 1412 1413 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1414 1415 return Changed; 1416 } 1417 1418 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1419 SIAtomicScope Scope, 1420 SIAtomicAddrSpace AddrSpace, 1421 bool IsCrossAddrSpaceOrdering, 1422 Position Pos) const { 1423 bool Changed = false; 1424 1425 MachineBasicBlock &MBB = *MI->getParent(); 1426 DebugLoc DL = MI->getDebugLoc(); 1427 1428 if (Pos == Position::AFTER) 1429 ++MI; 1430 1431 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1432 switch (Scope) { 1433 case SIAtomicScope::SYSTEM: 1434 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1435 // hardware does not reorder memory operations by the same wave with 1436 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1437 // to initiate writeback of any dirty cache lines of earlier writes by the 1438 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1439 // writeback has completed. 
1440 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1441 // Set SC bits to indicate system scope. 1442 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1443 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1444 // vmcnt(0)" needed by the "BUFFER_WBL2". 1445 Changed = true; 1446 break; 1447 case SIAtomicScope::AGENT: 1448 case SIAtomicScope::WORKGROUP: 1449 case SIAtomicScope::WAVEFRONT: 1450 case SIAtomicScope::SINGLETHREAD: 1451 // Same as GFX7. 1452 break; 1453 default: 1454 llvm_unreachable("Unsupported synchronization scope"); 1455 } 1456 } 1457 1458 if (Pos == Position::AFTER) 1459 --MI; 1460 1461 Changed |= 1462 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1463 IsCrossAddrSpaceOrdering, Pos); 1464 1465 return Changed; 1466 } 1467 1468 bool SIGfx940CacheControl::enableLoadCacheBypass( 1469 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1470 SIAtomicAddrSpace AddrSpace) const { 1471 assert(MI->mayLoad() && !MI->mayStore()); 1472 bool Changed = false; 1473 1474 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1475 switch (Scope) { 1476 case SIAtomicScope::SYSTEM: 1477 // Set SC bits to indicate system scope. 1478 Changed |= enableSC0Bit(MI); 1479 Changed |= enableSC1Bit(MI); 1480 break; 1481 case SIAtomicScope::AGENT: 1482 // Set SC bits to indicate agent scope. 1483 Changed |= enableSC1Bit(MI); 1484 break; 1485 case SIAtomicScope::WORKGROUP: 1486 // In threadgroup split mode the waves of a work-group can be executing on 1487 // different CUs. Therefore need to bypass the L1 which is per CU. 1488 // Otherwise in non-threadgroup split mode all waves of a work-group are 1489 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1490 // bits to indicate work-group scope will do this automatically. 1491 Changed |= enableSC0Bit(MI); 1492 break; 1493 case SIAtomicScope::WAVEFRONT: 1494 case SIAtomicScope::SINGLETHREAD: 1495 // Leave SC bits unset to indicate wavefront scope. 1496 break; 1497 default: 1498 llvm_unreachable("Unsupported synchronization scope"); 1499 } 1500 } 1501 1502 /// The scratch address space does not need the global memory caches 1503 /// to be bypassed as all memory operations by the same thread are 1504 /// sequentially consistent, and no other thread can access scratch 1505 /// memory. 1506 1507 /// Other address spaces do not have a cache. 1508 1509 return Changed; 1510 } 1511 1512 bool SIGfx940CacheControl::enableStoreCacheBypass( 1513 const MachineBasicBlock::iterator &MI, 1514 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1515 assert(!MI->mayLoad() && MI->mayStore()); 1516 bool Changed = false; 1517 1518 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1519 switch (Scope) { 1520 case SIAtomicScope::SYSTEM: 1521 // Set SC bits to indicate system scope. 1522 Changed |= enableSC0Bit(MI); 1523 Changed |= enableSC1Bit(MI); 1524 break; 1525 case SIAtomicScope::AGENT: 1526 // Set SC bits to indicate agent scope. 1527 Changed |= enableSC1Bit(MI); 1528 break; 1529 case SIAtomicScope::WORKGROUP: 1530 // Set SC bits to indicate workgroup scope. 1531 Changed |= enableSC0Bit(MI); 1532 break; 1533 case SIAtomicScope::WAVEFRONT: 1534 case SIAtomicScope::SINGLETHREAD: 1535 // Leave SC bits unset to indicate wavefront scope. 
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result and so must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
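    // Hence the IsCrossAddrSpaceOrdering argument to insertWait below is
    // false.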
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    Changed |= enableNTBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later reads will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it if we know we are not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding
        // buffer invalidate. The invalidate is guaranteed to remove any cache
        // lines of earlier writes and ensures later reads will refetch the
        // cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would
      // writeback, and it would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // Ensure the "S_WAITCNT" needed by any "BUFFER_WBL2" is emitted, as well as
  // any other "S_WAITCNT" that is needed.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
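      // Illustrative example (assembly form is only a sketch): after these
      // bits are set, an agent- or system-scope acquire load is emitted
      // roughly as "global_load_dword ... glc dlc" so it misses in L0 and L1.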
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering, as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
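    // Illustrative example: a nontemporal global store would then carry both
    // bits, e.g. roughly "global_store_dword ... glc slc" (sketch only; the
    // actual opcode and operands depend on the access).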
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx11CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
    Changed |= enableDLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering, as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
    Changed |= enableDLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
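  // For example (subtarget-dependent sketch): on gfx10 a volatile global store
  // is typically followed by "s_waitcnt_vscnt null, 0", while a nontemporal
  // store only gets the glc/slc (STREAM) cache-policy bits.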
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire)
      Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
        Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}