//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
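// Note: SIMemOp and SIAtomicAddrSpace are bitmask enums, so their values can
// be combined and queried with the ordinary bitwise operators. A minimal
// illustrative sketch (not part of the pass logic itself):
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   if ((AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
//     // The access touches the global address space.
//   }
//
// This "(X & FLAG) != NONE" test is the pattern used throughout the
// cache-control implementations below.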
/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces corresponding to \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};
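// The concrete cache controls below are selected per hardware generation by
// SICacheControl::create(): SIGfx6CacheControl for SOUTHERN_ISLANDS and
// earlier, SIGfx7CacheControl for SEA_ISLANDS through GFX9, and
// SIGfx10CacheControl for GFX10. Each subclass overrides only the hooks whose
// behavior differs from its parent.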
class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
  bool CuMode = false;

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
    SIGfx7CacheControl(ST), CuMode(CuMode) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}
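// When an instruction carries several memory operands, the info constructed
// below is their conservative union: the ordering and synchronization scope
// are the strongest ones seen, the instruction address space is the union of
// all operands' address spaces, and the access is treated as nontemporal only
// if every operand is nontemporal. Illustrative example: combining a monotonic
// workgroup-scope operand with an acquire agent-scope operand yields an
// acquire operation at agent scope.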
Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
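// Illustrative effect of the above (exact operands elided, assuming a MUBUF
// load with a glc bit): an agent-scope atomic load such as
//
//   buffer_load_dword v0, ...
//
// is rewritten to
//
//   buffer_load_dword v0, ... glc
//
// so that it bypasses the per-CU L1 cache and reads data coherent with the
// other compute units of the agent.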
bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
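// GFX10 differs from earlier generations in ways that matter here: vector
// memory goes through a per-CU L0 and a shared GL1 cache in front of the L2,
// and outstanding vector stores are tracked by a separate vscnt counter. The
// SIGfx10CacheControl overrides below therefore also use the DLC bit for
// cache bypass, invalidate with BUFFER_GL0_INV / BUFFER_GL1_INV, and wait
// using S_WAITCNT_VSCNT in addition to S_WAITCNT. Illustrative example: a
// workgroup-scope acquire load of global memory in WGP mode is followed by
//
//   s_waitcnt vmcnt(0)
//   buffer_gl0_inv
//
// whereas in CU mode no extra instructions are needed for that scope.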
bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore we need to bypass the L0, which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be bypassed.
      if (!CuMode) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  Changed |= enableSLCBit(MI);
  /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore we need to invalidate the L0, which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!CuMode) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore we need to wait for operations to complete to
      // ensure they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // and share the same L0.
      if (!CuMode) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}
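// Illustrative end-to-end example of the expansion above (assuming a GFX7
// amdhsa target and a global atomic load): the IR instruction
//
//   %v = load atomic i32, i32 addrspace(1)* %p syncscope("agent") acquire, align 4
//
// is selected into a buffer/flat load; this pass then sets its glc bit
// (enableLoadCacheBypass) and appends
//
//   s_waitcnt vmcnt(0)
//   buffer_wbinvl1_vol
//
// so the wave waits for the loaded value and invalidates the L1 before any
// subsequent accesses can hit stale data.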
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
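// Illustrative example of the fence expansion (assuming a GFX7 amdhsa target):
//
//   fence syncscope("agent") acq_rel
//
// becomes
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   buffer_wbinvl1_vol
//
// where the waitcnt implements the release half (prior accesses complete) and
// the invalidate implements the acquire half; the ATOMIC_FENCE pseudo itself
// is queued in AtomicPseudoMIs and deleted by removeAtomicPseudoMIs().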
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}