//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns Returns true if \p MI is modified, false otherwise.
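/// For example, instantiating this as enableNamedBit<AMDGPU::OpName::glc>(MI)
/// turns on the glc modifier of \p MI when the instruction encodes one; the
/// enable*Bit helpers in the cache control classes below are thin wrappers
/// around such instantiations.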
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
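  /// (That is, any ordering other than AtomicOrdering::NotAtomic.)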
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
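  /// For example, the concrete implementations below realize an agent scope
  /// acquire by inserting a cache invalidate: BUFFER_WBINVL1 (or
  /// BUFFER_WBINVL1_VOL) on gfx6/gfx7, and BUFFER_GL0_INV/BUFFER_GL1_INV on
  /// gfx10.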
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derived classes to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
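  /// (GLC and SLC together are what enableNonTemporal below uses to mark an
  /// access as nontemporal on this generation.)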
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);

  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);

  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);

  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

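// A rough illustration of the merging rules implemented below: if an
// instruction carried one monotonic workgroup-scope MMO on global memory and
// one acquire agent-scope MMO on LDS, the resulting SIMemOpInfo would be an
// acquire at agent scope whose instruction address space covers both GLOBAL
// and LDS. Scopes that do not include one another are rejected as
// unsupported.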
Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

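// Note that when an instruction has no memory operands, the queries above and
// below fall back to the default-constructed SIMemOpInfo, i.e. a sequentially
// consistent, system scope operation over all address spaces, which is the
// most conservative assumption.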
Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

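// For gfx6/gfx7 the cache control below is expressed purely through the glc
// and slc modifiers plus S_WAITCNT and BUFFER_WBINVL1* instructions; roughly
// speaking, setting glc on a load makes it bypass the per-CU L1 so that agent
// and system scope atomics observe memory through the shared L2.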
bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

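// gfx10 has a per-CU L0 vector cache and a larger GL1 cache; the
// workgroup-scope cases below therefore additionally depend on whether the
// subtarget runs in CU mode or WGP mode (ST.isCuModeEnabled()).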
bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  Changed |= enableSLCBit(MI);
  /// TODO: for store (non-rmw atomic) instructions also enableGLCBit(MI).

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

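// As a concrete sketch of expandLoad above: with the gfx6 cache control, an
// agent scope acquire load typically becomes the load with glc set, followed
// by "s_waitcnt vmcnt(0)" and "buffer_wbinvl1"; the exact sequence depends on
// the subtarget and on which address spaces participate in the ordering.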
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be done
    // by reorganizing this code or as part of optimizing the SIInsertWaitcnt
    // pass to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

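// Similarly, an acquire-release fence at agent scope handled above is lowered
// (for gfx6) to "s_waitcnt vmcnt(0) lgkmcnt(0)" followed by "buffer_wbinvl1",
// assuming the fence orders more than one address space; gfx10 additionally
// tracks vscnt for stores and uses the GL0/GL1 invalidates instead.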
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}