//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

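// Note that SIAtomicAddrSpace is a bitmask enum: a FLAT access is modeled as
// GLOBAL | LDS | SCRATCH, so a check such as
//   (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE
// below holds both for plain global accesses and for flat accesses.
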
/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns Returns true if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

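// As an illustration, an IR atomic load carrying syncscope("agent") and
// acquire ordering reaches this pass with a memory operand whose sync scope is
// the agent SSID and whose ordering is AtomicOrdering::Acquire; SIMemOpAccess
// below folds that into a SIMemOpInfo with Scope == SIAtomicScope::AGENT and
// Ordering == AtomicOrdering::Acquire.
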
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure any caches associated with
  /// address spaces \p AddrSpace for memory scopes up to memory scope
  /// \p Scope are invalidated. Returns true iff any instructions
  /// inserted.
  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions of kind \p Op
  /// associated with address spaces \p AddrSpace have completed as
  /// observed by other memory instructions executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  /// ordering is between address spaces. Returns true iff any
  /// instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
  bool CuMode = false;

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
      SIGfx7CacheControl(ST), CuMode(CuMode) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
             OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

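// For example, a SOUTHERN_ISLANDS target such as gfx600 gets
// SIGfx6CacheControl, anything from SEA_ISLANDS up to and including GFX9
// (e.g. gfx900) gets SIGfx7CacheControl, and GFX10 targets such as gfx1010
// get SIGfx10CacheControl with the CU mode taken from the subtarget.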
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering is required then an LDS waitcnt is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering is required then a GDS waitcnt is
      // not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

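  // Illustratively, when both counters are requested the immediate below
  // encodes vmcnt(0) and lgkmcnt(0) while leaving expcnt unconstrained,
  // i.e. roughly "s_waitcnt vmcnt(0) lgkmcnt(0)"; when only global memory is
  // involved the wait reduces to "s_waitcnt vmcnt(0)".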
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
                             ? AMDGPU::BUFFER_WBINVL1
                             : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(Flush));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore we need to bypass the L0, which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be bypassed.
      if (!CuMode) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  Changed |= enableSLCBit(MI);
  /// TODO: for store (non-rmw atomic) instructions also enableGLCBit(MI).

  return Changed;
}

bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                                SIAtomicScope Scope,
                                                SIAtomicAddrSpace AddrSpace,
                                                Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore we need to invalidate the L0, which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!CuMode) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore we need to wait for operations to complete to
      // ensure they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!CuMode) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering is required then an LDS waitcnt is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering is required then a GDS waitcnt is
      // not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

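// As an informal sketch (not verbatim compiler output), on a gfx9 HSA target
// an agent-scope sequentially consistent global atomic load is expanded
// roughly as:
//   s_waitcnt vmcnt(0) lgkmcnt(0)            ; wait inserted before the load
//   global_load_dword v1, v[0:1], off glc    ; load with the L1 bypassed
//   s_waitcnt vmcnt(0)                       ; wait for the load to complete
//   buffer_wbinvl1_vol                       ; invalidate the L1 cache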
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding a waitcnt before a S_BARRIER.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}