//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for the scan. DPP uses Whole
///    Wave Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane and
///    writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
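  // The whole optimization relies on every active lane targeting the same
  // address, so that a single lane can issue one combined atomic on behalf of
  // the entire wavefront.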
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  if ((Op == AtomicRMWInst::FAdd || Op == AtomicRMWInst::FSub) &&
      !I.use_empty()) {
    // Disable the uniform return value calculation using fmul because it
    // mishandles infinities, NaNs and signed zeros. FIXME.
    ValDivergent = true;
  }

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
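  // After the row_xmask butterfly above, every lane already holds its own
  // row's reduction; permlanex16 pulls the corresponding value from the other
  // row of the 32-lane group, so combining the two gives every lane the
  // reduction of all 32 lanes.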
  assert(ST->hasPermLaneX16());
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
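// After the shift, each lane N holds the value that lane N-1 held before, and
// lane 0 holds the identity.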
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative
// implementation to DPP, which uses WWM for the scan computation. This routine
// iterates over the active lanes, reading, combining and updating the value
// using the readlane and writelane intrinsics.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions into the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for the accumulator, the scan results destination, and the
  // remaining active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
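  // The second operand is true (zero is poison) because ActiveBits still has
  // at least the current lane's bit set whenever this executes: the loop exits
  // before the mask can reach zero.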
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(),
                                       Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform the writelane if intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the current lane's bit so that in the next iteration llvm.cttz
  // returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
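  // All of the replacement IR is inserted at I's position; I itself is erased
  // at the end of this function.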
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BBs.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform the scan with the add operation and allow one lane
  // to subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
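  // Depending on ScanImpl this is done either with DPP in whole wave mode or
  // with the iterative readlane/writelane loop built in ComputeLoop/ComputeEnd.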
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        assert(TyBitWidth == 32);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
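  // SplitBlockAndInsertIfThen below moves I into a new block, so its parent is
  // about to change.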
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block so that a single lane in the
  // wavefront can update the precomputed reduced value, and (for the iterative
  // strategy) we have also generated the new control flow, i.e. the loop that
  // performs the reduction and scan using ComputeLoop and ComputeEnd. For that
  // new control flow we need to move the branch instruction, i.e. the
  // terminator created by SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block. We also need to record which block is the
  // non-single-lane predecessor of I's block, for the result PHI built once
  // the single lane is done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch unconditionally from I's original block to the ComputeLoop block
    // for the iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, so make sure
    // we move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into the single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value that was in the lowest active lane (the
    // first lane) to all other lanes in the wavefront. We use the
    // readfirstlane intrinsic for this.
    Value *BroadcastI =
        B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated, combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        // FIXME: This path is currently disabled in visitAtomicRMWInst because
        // of problems calculating the first active lane of the result (where
        // Mbcnt is 0):
        // - If V is infinity or NaN we will return NaN instead of BroadcastI.
        // - If BroadcastI is -0.0 and V is positive we will return +0.0 instead
        //   of -0.0.
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}