//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
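  // Merge info over all memory operands: the access is nontemporal only if
  // every operand is nontemporal (initialized true, ANDed below), and volatile
  // if any operand is volatile (initialized false, ORed below).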
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
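  // A default-constructed SIMemOpInfo is maximally conservative: sequentially
  // consistent, system scope, ordering all atomic address spaces.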
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
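      // The L1 keeps memory operations by wavefronts in the same work-group
      // in order, so these scopes need no bypass.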
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. Also they do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
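
  // Undo the earlier iterator advance for Position::AFTER so the caller's
  // iterator still refers to the original instruction.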
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. Also they do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. Also they do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Support for barriers could be added in this
      /// file. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
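
// Illustrative sketch only (not part of the pass, and not authoritative
// generated code): for a GFX10 sequentially consistent atomic load from
// global memory at agent scope, expandLoad() combines the SIGfx10CacheControl
// hooks above roughly as follows. The exact waitcnt counters depend on the
// ordering/instruction address spaces and on whether cross address space
// ordering is required.
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)     ; insertWait(..., LOAD | STORE, BEFORE)
//   s_waitcnt_vscnt null, 0x0
//   <atomic load, with cache bypass   ; enableLoadCacheBypass()
//    bits set by the cache control>
//   s_waitcnt vmcnt(0)                ; insertWait(..., LOAD, AFTER)
//   buffer_gl0_inv                    ; insertAcquire(..., AFTER)
//   buffer_gl1_inv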