//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for the scan. DPP uses Whole
///    Wave Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane &
///    writelane intrinsics.
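///
/// For illustration (a sketch, not taken from a test), a wavefront-uniform
/// integer add such as
///   %old = atomicrmw add ptr addrspace(1) %p, i32 %v
/// is rewritten so that a single lane performs
///   %old = atomicrmw add ptr addrspace(1) %p, i32 (%v * popcount(exec))
/// and every active lane then recovers its own result as
///   readfirstlane(%old) + %v * mbcnt(exec)
/// where mbcnt counts the active lanes below the current one.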
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32- and 64-bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
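// Integer min/max are materialized as an icmp feeding a select, and the FP
// min/max forms map onto maxnum/minnum, matching atomicrmw fmax/fmin
// semantics.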
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
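  // Each step combines V with the value from the lane whose index differs in
  // bit Idx (row_xmask 1, 2, 4, 8), a butterfly pattern that leaves every lane
  // of a 16-lane row holding the reduction of the whole row.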
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
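    // row_bcast:15 pushes each even row's running total into the odd row
    // above it (row mask 0xa), and row_bcast:31 then pushes lane 31's total
    // into the upper 32 lanes (row mask 0xc), completing the inclusive scan.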
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
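    // wave_shr:1 moves every lane's value one lane up the wavefront; lane 0,
    // which has no source lane, receives Identity (the passthrough operand),
    // which is exactly what an exclusive scan needs.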
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation, which uses WWM for its scan computations. This routine
// iterates over the active lanes to read, compute, and update the value using
// the readlane and writelane intrinsics.
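//
// A rough sketch of the loop this emits (pseudocode, names illustrative):
//   Accumulator = Identity; ActiveBits = ballot(true);
//   do {
//     Lane        = cttz(ActiveBits);
//     OldValuePhi = writelane(Accumulator, Lane, OldValuePhi); // excl. scan
//     Accumulator = Op(Accumulator, readlane(V, Lane));
//     ActiveBits &= ~(1 << Lane);
//   } while (ActiveBits != 0);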
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions into the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // PHI nodes for the accumulator, the scan result destination, and the
  // active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the current active lane so that on the next iteration
  // llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BBs.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
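  // mbcnt returns popcount(Ballot & ((1 << LaneId) - 1)). For example, with a
  // ballot of 0b0110, lane 1 gets Mbcnt 0 and lane 2 gets Mbcnt 1, ranking
  // each active lane among the active lanes below it.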
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform the scan with the add operation and allow one lane
  // to subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
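      // For example, five active lanes each contributing V give
      // V^V^V^V^V == V, so the combined contribution is V * (popcount & 1).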
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below it, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point, we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. We have also completed the
  // codegen for the new control flow, i.e. the iterative loop which performs
  // the reduction and scan using the ComputeLoop and ComputeEnd blocks.
  // For the new control flow, we need to move the branch instruction, i.e. the
  // terminator created during SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the next block
  // for when the single lane is done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
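    // The edges to add are OriginalBB -> ComputeLoop and ComputeLoop ->
    // ComputeEnd; the loop below then retargets each of OriginalBB's former
    // successor edges to come from ComputeEnd instead.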
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, so make sure
    // we move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into the single_lane block, replacing
  // the original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use the readfirstlane
    // intrinsic for this.
    Value *BroadcastI =
        B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
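    // For an integer add, for instance, each lane computes
    // readfirstlane(old) + LaneOffset, where LaneOffset is V * Mbcnt for a
    // uniform V or the exclusive-scan value when V is divergent.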
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
      // is V * +0.0 which might have the wrong sign or might be nan (if V is
      // inf or nan).
      //
      // For all floating point ops if the in-memory value was a nan then the
      // binop we just built might have quieted it or changed its payload.
      //
      // Correct all these problems by using BroadcastI as the result in the
      // first active lane.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}