//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space. Note: the parameter shadows
    // the member here, so assign through "this" to update the member.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure any caches associated with
  /// address spaces \p AddrSpace for memory scopes up to memory scope
  /// \p Scope are invalidated. Returns true iff any instructions
  /// inserted.
  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions of kind \p Op
  /// associated with address spaces \p AddrSpace have completed as
  /// observed by other memory instructions executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  /// ordering is between address spaces. Returns true iff any
  /// instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);

  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);

  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);

  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  /// TODO: For now assume OpenCL memory model which treats each
  /// address space as having a separate happens-before relation, and
  /// so an instruction only has ordering with respect to the address
  /// space it accesses, and if it accesses multiple address spaces it
  /// does not require ordering of operations in different address
  /// spaces.
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  /// TODO: To support the HSA memory model, additional memory scopes that
  /// specify they do require cross address space ordering need to be added.
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return make_unique<SIGfx6CacheControl>(ST);
  return make_unique<SIGfx7CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;
  bool EXPCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If there is no cross address space ordering, then an LDS waitcnt
      // is not needed as LDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/GDS memory as LDS operations
      // could be reordered with respect to later global/GDS memory
      // operations of the same wave.
      LGKMCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If there is no cross address space ordering, then a GDS waitcnt
      // is not needed as GDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/LDS memory as GDS operations
      // could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      EXPCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt || EXPCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            EXPCnt ? 0 : getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
                             ? AMDGPU::BUFFER_WBINVL1
                             : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(Flush));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding waitcnt before an S_BARRIER.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
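
//===----------------------------------------------------------------------===//
// Illustrative example (a rough sketch, not emitted verbatim by this file):
// on a GFX7+ amdhsa subtarget, a sequentially consistent, agent-scope atomic
// load from the global address space is expected to be legalized roughly as
// follows. Register numbers are arbitrary; the exact sequence depends on the
// subtarget and surrounding code.
//
//   s_waitcnt vmcnt(0)              ; insertWait BEFORE: complete prior
//                                   ; global accesses (seq_cst)
//   flat_load_dword v2, v[0:1] glc  ; glc set by enableLoadCacheBypass
//   s_waitcnt vmcnt(0)              ; insertWait AFTER: wait for the load
//   buffer_wbinvl1_vol              ; insertCacheInvalidate: invalidate L1
//
// See the memory legalizer tests under test/CodeGen/AMDGPU for the sequences
// actually produced for each ordering, scope, and address space combination.
//===----------------------------------------------------------------------===//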