//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
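
// Illustrative note (added comment, not from the original source): thanks to
// LLVM_MARK_AS_BITMASK_ENUM, these enums compose with the usual bitwise
// operators, and the rest of this file tests membership with the pattern
//   (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE
// e.g. SIAtomicAddrSpace::FLAT above is simply GLOBAL | LDS | SCRATCH.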

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
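    // Illustrative example (added comment, not from the original source): an
    // atomic whose instruction address space is only LDS (and/or SCRATCH) is
    // clamped to at most workgroup scope below, since LDS is only shared
    // within a work-group.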
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }
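  // Added summary (illustrative, not from the original source): the load/store
  // cache-bypass hooks of this class encode scope in the SC bits roughly as:
  //   system     -> SC0 | SC1
  //   agent      -> SC1
  //   workgroup  -> SC0
  //   wavefront  -> neither bit set
  // RMW atomics are different: SC0 selects return vs. no-return, so only SC1
  // is used to encode scope for them.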

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
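  // Added explanatory comment (not from the original source): the loop below
  // merges information from every memory operand: address spaces and
  // volatility are unioned, nontemporal must hold for all operands, and the
  // atomic success/failure orderings are merged to the strongest ones, with
  // the synchronization scope widened to the most inclusive scope seen.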
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
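  // (Added note, not in the original: a default-constructed SIMemOpInfo is a
  // sequentially consistent, system-scope operation over all address spaces,
  // which is the safest over-approximation here.)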
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

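// Added note (assumption based on the checks below, not original text): the
// GFX940/GFX90A feature checks are performed before the generation
// comparisons because those subtargets belong to the GFX9 generation and
// would otherwise be handled by SIGfx7CacheControl.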
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  return std::make_unique<SIGfx11CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

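  // Added illustrative comment (not from the original source): passing a
  // counter's full bit mask to encodeWaitcnt means "do not wait" on that
  // counter. For example, an agent-scope ordering of only global accesses sets
  // just VMCnt, and the emitted instruction is expected to print as
  // "s_waitcnt vmcnt(0)".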
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
        // Set SC bits to indicate system scope.
        .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx940CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed. Setting SC
      // bits to indicate work-group scope will do this automatically.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // Set SC bits to indicate workgroup scope.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
1600 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1601 Position::AFTER); 1602 1603 return Changed; 1604 } 1605 1606 if (IsNonTemporal) { 1607 Changed |= enableNTBit(MI); 1608 return Changed; 1609 } 1610 1611 return Changed; 1612 } 1613 1614 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1615 SIAtomicScope Scope, 1616 SIAtomicAddrSpace AddrSpace, 1617 Position Pos) const { 1618 if (!InsertCacheInv) 1619 return false; 1620 1621 bool Changed = false; 1622 1623 MachineBasicBlock &MBB = *MI->getParent(); 1624 DebugLoc DL = MI->getDebugLoc(); 1625 1626 if (Pos == Position::AFTER) 1627 ++MI; 1628 1629 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1630 switch (Scope) { 1631 case SIAtomicScope::SYSTEM: 1632 // Ensures that following loads will not see stale remote VMEM data or 1633 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1634 // CC will never be stale due to the local memory probes. 1635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1636 // Set SC bits to indicate system scope. 1637 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1638 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1639 // hardware does not reorder memory operations by the same wave with 1640 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to 1641 // remove any cache lines of earlier writes by the same wave and ensures 1642 // later reads by the same wave will refetch the cache lines. 1643 Changed = true; 1644 break; 1645 case SIAtomicScope::AGENT: 1646 // Ensures that following loads will not see stale remote data or local 1647 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale 1648 // due to the memory probes. 1649 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1650 // Set SC bits to indicate agent scope. 1651 .addImm(AMDGPU::CPol::SC1); 1652 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware 1653 // does not reorder memory operations with respect to a preceding buffer 1654 // invalidate. The invalidate is guaranteed to remove any cache lines of 1655 // earlier writes and ensures later reads will refetch the cache lines. 1656 Changed = true; 1657 break; 1658 case SIAtomicScope::WORKGROUP: 1659 // In threadgroup split mode the waves of a work-group can be executing on 1660 // different CUs. Therefore need to invalidate the L1 which is per CU. 1661 // Otherwise in non-threadgroup split mode all waves of a work-group are 1662 // on the same CU, and so the L1 does not need to be invalidated. 1663 if (ST.isTgSplitEnabled()) { 1664 // Ensures L1 is invalidated if in threadgroup split mode. In 1665 // non-threadgroup split mode it is a NOP, but there is no point 1666 // generating it in that case if we know we are not in that mode. 1667 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1668 // Set SC bits to indicate work-group scope. 1669 .addImm(AMDGPU::CPol::SC0); 1670 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware 1671 // does not reorder memory operations with respect to a preceding buffer 1672 // invalidate. The invalidate is guaranteed to remove any cache lines of 1673 // earlier writes and ensures later reads will refetch the cache lines. 1674 Changed = true; 1675 } 1676 break; 1677 case SIAtomicScope::WAVEFRONT: 1678 case SIAtomicScope::SINGLETHREAD: 1679 // Could generate "BUFFER_INV" but it would do nothing as there are no 1680 // caches to invalidate.
1681 break; 1682 default: 1683 llvm_unreachable("Unsupported synchronization scope"); 1684 } 1685 } 1686 1687 /// The scratch address space does not need the global memory cache 1688 /// to be flushed as all memory operations by the same thread are 1689 /// sequentially consistent, and no other thread can access scratch 1690 /// memory. 1691 1692 /// Other address spaces do not have a cache. 1693 1694 if (Pos == Position::AFTER) 1695 --MI; 1696 1697 return Changed; 1698 } 1699 1700 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1701 SIAtomicScope Scope, 1702 SIAtomicAddrSpace AddrSpace, 1703 bool IsCrossAddrSpaceOrdering, 1704 Position Pos) const { 1705 bool Changed = false; 1706 1707 MachineBasicBlock &MBB = *MI->getParent(); 1708 DebugLoc DL = MI->getDebugLoc(); 1709 1710 if (Pos == Position::AFTER) 1711 ++MI; 1712 1713 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1714 switch (Scope) { 1715 case SIAtomicScope::SYSTEM: 1716 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1717 // hardware does not reorder memory operations by the same wave with 1718 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1719 // to initiate writeback of any dirty cache lines of earlier writes by the 1720 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1721 // writeback has completed. 1722 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1723 // Set SC bits to indicate system scope. 1724 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1725 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1726 // SIAtomicScope::SYSTEM, the following insertWait will generate the 1727 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 1728 Changed = true; 1729 break; 1730 case SIAtomicScope::AGENT: 1731 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1732 // Set SC bits to indicate agent scope. 1733 .addImm(AMDGPU::CPol::SC1); 1734 1735 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1736 // SIAtomicScope::AGENT, the following insertWait will generate the 1737 // required "S_WAITCNT vmcnt(0)". 1738 Changed = true; 1739 break; 1740 case SIAtomicScope::WORKGROUP: 1741 case SIAtomicScope::WAVEFRONT: 1742 case SIAtomicScope::SINGLETHREAD: 1743 // Do not generate "BUFFER_WBL2" as there are no caches it would write 1744 // back, and it would require an otherwise unnecessary 1745 // "S_WAITCNT vmcnt(0)". 1746 break; 1747 default: 1748 llvm_unreachable("Unsupported synchronization scope"); 1749 } 1750 } 1751 1752 if (Pos == Position::AFTER) 1753 --MI; 1754 1755 // Ensure the S_WAITCNT needed by any "BUFFER_WBL2" above, as well as any 1756 // other required S_WAITCNT, is generated. 1757 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1758 IsCrossAddrSpaceOrdering, Pos); 1759 1760 return Changed; 1761 } 1762 1763 bool SIGfx10CacheControl::enableLoadCacheBypass( 1764 const MachineBasicBlock::iterator &MI, 1765 SIAtomicScope Scope, 1766 SIAtomicAddrSpace AddrSpace) const { 1767 assert(MI->mayLoad() && !MI->mayStore()); 1768 bool Changed = false; 1769 1770 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1771 switch (Scope) { 1772 case SIAtomicScope::SYSTEM: 1773 case SIAtomicScope::AGENT: 1774 // Set the L0 and L1 cache policies to MISS_EVICT. 1775 // Note: there is no L2 cache coherent bypass control at the ISA level.
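      // On GFX10 the GLC bit selects the L0 bypass (MISS_EVICT) and the DLC
      // bit the L1 bypass, which is why both bits are set below.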
1776 Changed |= enableGLCBit(MI); 1777 Changed |= enableDLCBit(MI); 1778 break; 1779 case SIAtomicScope::WORKGROUP: 1780 // In WGP mode the waves of a work-group can be executing on either CU of 1781 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 1782 // CU mode all waves of a work-group are on the same CU, and so the L0 1783 // does not need to be bypassed. 1784 if (!ST.isCuModeEnabled()) 1785 Changed |= enableGLCBit(MI); 1786 break; 1787 case SIAtomicScope::WAVEFRONT: 1788 case SIAtomicScope::SINGLETHREAD: 1789 // No cache to bypass. 1790 break; 1791 default: 1792 llvm_unreachable("Unsupported synchronization scope"); 1793 } 1794 } 1795 1796 /// The scratch address space does not need the global memory caches 1797 /// to be bypassed as all memory operations by the same thread are 1798 /// sequentially consistent, and no other thread can access scratch 1799 /// memory. 1800 1801 /// Other address spaces do not have a cache. 1802 1803 return Changed; 1804 } 1805 1806 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( 1807 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1808 bool IsVolatile, bool IsNonTemporal) const { 1809 1810 // Only handle load and store, not atomic read-modify-write instructions. The 1811 // latter use glc to indicate if the atomic returns a result and so must not 1812 // be used for cache control. 1813 assert(MI->mayLoad() ^ MI->mayStore()); 1814 1815 // Only update load and store, not LLVM IR atomic read-modify-write 1816 // instructions. The latter are always marked as volatile, so they cannot be 1817 // sensibly handled here without pessimizing all atomics. Also they do not 1818 // support the nontemporal attribute. 1819 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1820 1821 bool Changed = false; 1822 1823 if (IsVolatile) { 1824 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 1825 // and MISS_LRU for store instructions. 1826 // Note: there is no L2 cache coherent bypass control at the ISA level. 1827 if (Op == SIMemOp::LOAD) { 1828 Changed |= enableGLCBit(MI); 1829 Changed |= enableDLCBit(MI); 1830 } 1831 1832 // Ensure operation has completed at system scope to cause all volatile 1833 // operations to be visible outside the program in a global order. Do not 1834 // request cross address space as only the global address space can be 1835 // observable outside the program, so no need to cause a waitcnt for LDS 1836 // address space operations. 1837 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1838 Position::AFTER); 1839 return Changed; 1840 } 1841 1842 if (IsNonTemporal) { 1843 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 1844 // and L2 cache policy to STREAM. 1845 // For stores setting both GLC and SLC configures L0 and L1 cache policy 1846 // to MISS_EVICT and the L2 cache policy to STREAM.
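    // Illustrative only (not verbatim output of this pass): a nontemporal
    // global store would be emitted roughly as
    //   global_store_dword v[0:1], v2, off glc slc
    // whereas a nontemporal load gets only the slc modifier.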
1847 if (Op == SIMemOp::STORE) 1848 Changed |= enableGLCBit(MI); 1849 Changed |= enableSLCBit(MI); 1850 1851 return Changed; 1852 } 1853 1854 return Changed; 1855 } 1856 1857 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1858 SIAtomicScope Scope, 1859 SIAtomicAddrSpace AddrSpace, 1860 SIMemOp Op, 1861 bool IsCrossAddrSpaceOrdering, 1862 Position Pos) const { 1863 bool Changed = false; 1864 1865 MachineBasicBlock &MBB = *MI->getParent(); 1866 DebugLoc DL = MI->getDebugLoc(); 1867 1868 if (Pos == Position::AFTER) 1869 ++MI; 1870 1871 bool VMCnt = false; 1872 bool VSCnt = false; 1873 bool LGKMCnt = false; 1874 1875 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1876 SIAtomicAddrSpace::NONE) { 1877 switch (Scope) { 1878 case SIAtomicScope::SYSTEM: 1879 case SIAtomicScope::AGENT: 1880 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1881 VMCnt |= true; 1882 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1883 VSCnt |= true; 1884 break; 1885 case SIAtomicScope::WORKGROUP: 1886 // In WGP mode the waves of a work-group can be executing on either CU of 1887 // the WGP. Therefore need to wait for operations to complete to ensure 1888 // they are visible to waves in the other CU as the L0 is per CU. 1889 // Otherwise in CU mode all waves of a work-group are on the same CU 1890 // which shares the same L0. 1891 if (!ST.isCuModeEnabled()) { 1892 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1893 VMCnt |= true; 1894 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1895 VSCnt |= true; 1896 } 1897 break; 1898 case SIAtomicScope::WAVEFRONT: 1899 case SIAtomicScope::SINGLETHREAD: 1900 // The L0 cache keeps all memory operations in order for 1901 // work-items in the same wavefront. 1902 break; 1903 default: 1904 llvm_unreachable("Unsupported synchronization scope"); 1905 } 1906 } 1907 1908 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1909 switch (Scope) { 1910 case SIAtomicScope::SYSTEM: 1911 case SIAtomicScope::AGENT: 1912 case SIAtomicScope::WORKGROUP: 1913 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1914 // not needed as LDS operations for all waves are executed in a total 1915 // global ordering as observed by all waves. Required if also 1916 // synchronizing with global/GDS memory as LDS operations could be 1917 // reordered with respect to later global/GDS memory operations of the 1918 // same wave. 1919 LGKMCnt |= IsCrossAddrSpaceOrdering; 1920 break; 1921 case SIAtomicScope::WAVEFRONT: 1922 case SIAtomicScope::SINGLETHREAD: 1923 // The LDS keeps all memory operations in order for 1924 // the same wavefront. 1925 break; 1926 default: 1927 llvm_unreachable("Unsupported synchronization scope"); 1928 } 1929 } 1930 1931 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1932 switch (Scope) { 1933 case SIAtomicScope::SYSTEM: 1934 case SIAtomicScope::AGENT: 1935 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" 1936 // is not needed as GDS operations for all waves are executed in a total 1937 // global ordering as observed by all waves. Required if also 1938 // synchronizing with global/LDS memory as GDS operations could be 1939 // reordered with respect to later global/LDS memory operations of the 1940 // same wave. 1941 LGKMCnt |= IsCrossAddrSpaceOrdering; 1942 break; 1943 case SIAtomicScope::WORKGROUP: 1944 case SIAtomicScope::WAVEFRONT: 1945 case SIAtomicScope::SINGLETHREAD: 1946 // The GDS keeps all memory operations in order for 1947 // the same work-group.
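      // Hence no "S_WAITCNT lgkmcnt(0)" is required at or below work-group
      // scope.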
1948 break; 1949 default: 1950 llvm_unreachable("Unsupported synchronization scope"); 1951 } 1952 } 1953 1954 if (VMCnt || LGKMCnt) { 1955 unsigned WaitCntImmediate = 1956 AMDGPU::encodeWaitcnt(IV, 1957 VMCnt ? 0 : getVmcntBitMask(IV), 1958 getExpcntBitMask(IV), 1959 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 1960 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 1961 Changed = true; 1962 } 1963 1964 if (VSCnt) { 1965 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1966 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1967 .addImm(0); 1968 Changed = true; 1969 } 1970 1971 if (Pos == Position::AFTER) 1972 --MI; 1973 1974 return Changed; 1975 } 1976 1977 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1978 SIAtomicScope Scope, 1979 SIAtomicAddrSpace AddrSpace, 1980 Position Pos) const { 1981 if (!InsertCacheInv) 1982 return false; 1983 1984 bool Changed = false; 1985 1986 MachineBasicBlock &MBB = *MI->getParent(); 1987 DebugLoc DL = MI->getDebugLoc(); 1988 1989 if (Pos == Position::AFTER) 1990 ++MI; 1991 1992 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1993 switch (Scope) { 1994 case SIAtomicScope::SYSTEM: 1995 case SIAtomicScope::AGENT: 1996 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 1997 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 1998 Changed = true; 1999 break; 2000 case SIAtomicScope::WORKGROUP: 2001 // In WGP mode the waves of a work-group can be executing on either CU of 2002 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 2003 // in CU mode all waves of a work-group are on the same CU, and so the 2004 // L0 does not need to be invalidated. 2005 if (!ST.isCuModeEnabled()) { 2006 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2007 Changed = true; 2008 } 2009 break; 2010 case SIAtomicScope::WAVEFRONT: 2011 case SIAtomicScope::SINGLETHREAD: 2012 // No cache to invalidate. 2013 break; 2014 default: 2015 llvm_unreachable("Unsupported synchronization scope"); 2016 } 2017 } 2018 2019 /// The scratch address space does not need the global memory cache 2020 /// to be flushed as all memory operations by the same thread are 2021 /// sequentially consistent, and no other thread can access scratch 2022 /// memory. 2023 2024 /// Other address spaces do not have a cache. 2025 2026 if (Pos == Position::AFTER) 2027 --MI; 2028 2029 return Changed; 2030 } 2031 2032 bool SIGfx11CacheControl::enableLoadCacheBypass( 2033 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 2034 SIAtomicAddrSpace AddrSpace) const { 2035 assert(MI->mayLoad() && !MI->mayStore()); 2036 bool Changed = false; 2037 2038 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2039 switch (Scope) { 2040 case SIAtomicScope::SYSTEM: 2041 case SIAtomicScope::AGENT: 2042 // Set the L0 and L1 cache policies to MISS_EVICT. 2043 // Note: there is no L2 cache coherent bypass control at the ISA level. 2044 Changed |= enableGLCBit(MI); 2045 break; 2046 case SIAtomicScope::WORKGROUP: 2047 // In WGP mode the waves of a work-group can be executing on either CU of 2048 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 2049 // CU mode all waves of a work-group are on the same CU, and so the L0 2050 // does not need to be bypassed. 2051 if (!ST.isCuModeEnabled()) 2052 Changed |= enableGLCBit(MI); 2053 break; 2054 case SIAtomicScope::WAVEFRONT: 2055 case SIAtomicScope::SINGLETHREAD: 2056 // No cache to bypass.
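      // Note that, unlike GFX10, only the GLC bit is used for the bypass in
      // the cases above: on GFX11 the DLC bit instead selects the MALL NOALLOC
      // policy (see enableVolatileAndOrNonTemporal below).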
2057 break; 2058 default: 2059 llvm_unreachable("Unsupported synchronization scope"); 2060 } 2061 } 2062 2063 /// The scratch address space does not need the global memory caches 2064 /// to be bypassed as all memory operations by the same thread are 2065 /// sequentially consistent, and no other thread can access scratch 2066 /// memory. 2067 2068 /// Other address spaces do not have a cache. 2069 2070 return Changed; 2071 } 2072 2073 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( 2074 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2075 bool IsVolatile, bool IsNonTemporal) const { 2076 2077 // Only handle load and store, not atomic read-modify-write instructions. The 2078 // latter use glc to indicate if the atomic returns a result and so must not 2079 // be used for cache control. 2080 assert(MI->mayLoad() ^ MI->mayStore()); 2081 2082 // Only update load and store, not LLVM IR atomic read-modify-write 2083 // instructions. The latter are always marked as volatile, so they cannot be 2084 // sensibly handled here without pessimizing all atomics. Also they do not 2085 // support the nontemporal attribute. 2086 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 2087 2088 bool Changed = false; 2089 2090 if (IsVolatile) { 2091 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 2092 // and MISS_LRU for store instructions. 2093 // Note: there is no L2 cache coherent bypass control at the ISA level. 2094 if (Op == SIMemOp::LOAD) 2095 Changed |= enableGLCBit(MI); 2096 2097 // Set MALL NOALLOC for load and store instructions. 2098 Changed |= enableDLCBit(MI); 2099 2100 // Ensure operation has completed at system scope to cause all volatile 2101 // operations to be visible outside the program in a global order. Do not 2102 // request cross address space as only the global address space can be 2103 // observable outside the program, so no need to cause a waitcnt for LDS 2104 // address space operations. 2105 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 2106 Position::AFTER); 2107 return Changed; 2108 } 2109 2110 if (IsNonTemporal) { 2111 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 2112 // and L2 cache policy to STREAM. 2113 // For stores setting both GLC and SLC configures L0 and L1 cache policy 2114 // to MISS_EVICT and the L2 cache policy to STREAM. 2115 if (Op == SIMemOp::STORE) 2116 Changed |= enableGLCBit(MI); 2117 Changed |= enableSLCBit(MI); 2118 2119 // Set MALL NOALLOC for load and store instructions.
2120 Changed |= enableDLCBit(MI); 2121 return Changed; 2122 } 2123 2124 return Changed; 2125 } 2126 2127 bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 2128 if (AtomicPseudoMIs.empty()) 2129 return false; 2130 2131 for (auto &MI : AtomicPseudoMIs) 2132 MI->eraseFromParent(); 2133 2134 AtomicPseudoMIs.clear(); 2135 return true; 2136 } 2137 2138 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 2139 MachineBasicBlock::iterator &MI) { 2140 assert(MI->mayLoad() && !MI->mayStore()); 2141 2142 bool Changed = false; 2143 2144 if (MOI.isAtomic()) { 2145 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2146 MOI.getOrdering() == AtomicOrdering::Acquire || 2147 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2148 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 2149 MOI.getOrderingAddrSpace()); 2150 } 2151 2152 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2153 Changed |= CC->insertWait(MI, MOI.getScope(), 2154 MOI.getOrderingAddrSpace(), 2155 SIMemOp::LOAD | SIMemOp::STORE, 2156 MOI.getIsCrossAddressSpaceOrdering(), 2157 Position::BEFORE); 2158 2159 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2160 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2161 Changed |= CC->insertWait(MI, MOI.getScope(), 2162 MOI.getInstrAddrSpace(), 2163 SIMemOp::LOAD, 2164 MOI.getIsCrossAddressSpaceOrdering(), 2165 Position::AFTER); 2166 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2167 MOI.getOrderingAddrSpace(), 2168 Position::AFTER); 2169 } 2170 2171 return Changed; 2172 } 2173 2174 // Atomic instructions already bypass caches to the scope specified by the 2175 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2176 // need additional treatment. 2177 Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), 2178 SIMemOp::LOAD, MOI.isVolatile(), 2179 MOI.isNonTemporal()); 2180 return Changed; 2181 } 2182 2183 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, 2184 MachineBasicBlock::iterator &MI) { 2185 assert(!MI->mayLoad() && MI->mayStore()); 2186 2187 bool Changed = false; 2188 2189 if (MOI.isAtomic()) { 2190 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2191 MOI.getOrdering() == AtomicOrdering::Release || 2192 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2193 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), 2194 MOI.getOrderingAddrSpace()); 2195 } 2196 2197 if (MOI.getOrdering() == AtomicOrdering::Release || 2198 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2199 Changed |= CC->insertRelease(MI, MOI.getScope(), 2200 MOI.getOrderingAddrSpace(), 2201 MOI.getIsCrossAddressSpaceOrdering(), 2202 Position::BEFORE); 2203 2204 return Changed; 2205 } 2206 2207 // Atomic instructions already bypass caches to the scope specified by the 2208 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2209 // need additional treatment. 
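  // For example, a volatile global store is followed by a wait for completion
  // at system scope (an "S_WAITCNT vmcnt(0)" or "S_WAITCNT_VSCNT" depending on
  // the target) so that it becomes visible outside the program; see the
  // enableVolatileAndOrNonTemporal implementations above.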
2210 Changed |= CC->enableVolatileAndOrNonTemporal( 2211 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), 2212 MOI.isNonTemporal()); 2213 return Changed; 2214 } 2215 2216 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, 2217 MachineBasicBlock::iterator &MI) { 2218 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); 2219 2220 AtomicPseudoMIs.push_back(MI); 2221 bool Changed = false; 2222 2223 if (MOI.isAtomic()) { 2224 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2225 MOI.getOrdering() == AtomicOrdering::Release || 2226 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2227 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2228 /// TODO: This relies on a barrier always generating a waitcnt 2229 /// for LDS to ensure it is not reordered with the completion of 2230 /// the preceding LDS operations. If the barrier had a memory 2231 /// ordering and memory scope, then the library would not need to 2232 /// generate a fence. Could add support in this file for 2233 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally 2234 /// adding S_WAITCNT before a S_BARRIER. 2235 Changed |= CC->insertRelease(MI, MOI.getScope(), 2236 MOI.getOrderingAddrSpace(), 2237 MOI.getIsCrossAddressSpaceOrdering(), 2238 Position::BEFORE); 2239 2240 // TODO: If both release and invalidate are happening they could be combined 2241 // to use the single "BUFFER_WBINV*" instruction. This could be done by 2242 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to 2243 // track cache invalidate and write back instructions. 2244 2245 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2246 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2247 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2248 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2249 MOI.getOrderingAddrSpace(), 2250 Position::BEFORE); 2251 2252 return Changed; 2253 } 2254 2255 return Changed; 2256 } 2257 2258 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 2259 MachineBasicBlock::iterator &MI) { 2260 assert(MI->mayLoad() && MI->mayStore()); 2261 2262 bool Changed = false; 2263 2264 if (MOI.isAtomic()) { 2265 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2266 MOI.getOrdering() == AtomicOrdering::Acquire || 2267 MOI.getOrdering() == AtomicOrdering::Release || 2268 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2269 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2270 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2271 MOI.getInstrAddrSpace()); 2272 } 2273 2274 if (MOI.getOrdering() == AtomicOrdering::Release || 2275 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2276 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2277 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2278 Changed |= CC->insertRelease(MI, MOI.getScope(), 2279 MOI.getOrderingAddrSpace(), 2280 MOI.getIsCrossAddressSpaceOrdering(), 2281 Position::BEFORE); 2282 2283 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2284 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2285 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2286 MOI.getFailureOrdering() == AtomicOrdering::Acquire || 2287 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 2288 Changed |= CC->insertWait(MI, MOI.getScope(), 2289 MOI.getInstrAddrSpace(), 2290 isAtomicRet(*MI) ?
SIMemOp::LOAD : 2291 SIMemOp::STORE, 2292 MOI.getIsCrossAddressSpaceOrdering(), 2293 Position::AFTER); 2294 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2295 MOI.getOrderingAddrSpace(), 2296 Position::AFTER); 2297 } 2298 2299 return Changed; 2300 } 2301 2302 return Changed; 2303 } 2304 2305 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 2306 bool Changed = false; 2307 2308 SIMemOpAccess MOA(MF); 2309 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 2310 2311 for (auto &MBB : MF) { 2312 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 2313 2314 // Unbundle instructions after the post-RA scheduler. 2315 if (MI->isBundle() && MI->mayLoadOrStore()) { 2316 MachineBasicBlock::instr_iterator II(MI->getIterator()); 2317 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 2318 I != E && I->isBundledWithPred(); ++I) { 2319 I->unbundleFromPred(); 2320 for (MachineOperand &MO : I->operands()) 2321 if (MO.isReg()) 2322 MO.setIsInternalRead(false); 2323 } 2324 2325 MI->eraseFromParent(); 2326 MI = II->getIterator(); 2327 } 2328 2329 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 2330 continue; 2331 2332 if (const auto &MOI = MOA.getLoadInfo(MI)) 2333 Changed |= expandLoad(MOI.value(), MI); 2334 else if (const auto &MOI = MOA.getStoreInfo(MI)) 2335 Changed |= expandStore(MOI.value(), MI); 2336 else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2337 Changed |= expandAtomicFence(MOI.value(), MI); 2338 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2339 Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI); 2340 } 2341 } 2342 2343 Changed |= removeAtomicPseudoMIs(); 2344 return Changed; 2345 } 2346 2347 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 2348 2349 char SIMemoryLegalizer::ID = 0; 2350 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 2351 2352 FunctionPass *llvm::createSIMemoryLegalizerPass() { 2353 return new SIMemoryLegalizer(); 2354 } 2355
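//===----------------------------------------------------------------------===//
// Illustrative example (not verbatim compiler output): on GFX10, an atomic
// load with syncscope("agent") and acquire ordering is expanded by this pass
// roughly as
//   global_load_dword v0, v[0:1], off glc dlc  ; bypass L0/L1 (GLC/DLC)
//   s_waitcnt vmcnt(0)                         ; wait for the load
//   buffer_gl0_inv                             ; acquire: invalidate L0
//   buffer_gl1_inv                             ;          and L1
// matching the enableLoadCacheBypass/insertWait/insertAcquire logic above and
// the memory model described in AMDGPUUsage.html#memory-model.
//===----------------------------------------------------------------------===//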