//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///   This is the most efficient implementation for the scan. DPP uses Whole
///   Wave Mode (WWM).
/// 2. Iterative -
///   An alternative implementation that iterates over all active lanes of the
///   wavefront using llvm.cttz and performs the scan using the readlane &
///   writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
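  // Each ROW_XMASK step combines a lane with the lane whose index within the
  // row differs in bit Idx (xmask values 1, 2, 4, 8), i.e. a butterfly-style
  // reduction: after the four steps every lane in a row of 16 holds the
  // reduction of that whole row.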
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

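  // Build the inclusive scan within each row of 16 lanes: each ROW_SHR step
  // combines a lane with the lane 1, 2, 4 and then 8 positions below it, so
  // after the four steps every lane holds the prefix result for its row.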
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation (which uses WWM for the scan computations). It iterates over
// the active lanes to read, compute and update the value using the readlane
// and writelane intrinsics.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {

  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
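  // Loop state: Accumulator holds the reduction of the lane values processed
  // so far (which, when a lane is visited, is exactly that lane's exclusive
  // prefix, since lanes are processed in increasing order), OldValuePhi
  // collects those per-lane exclusive prefixes via writelane, and ActiveBits
  // tracks which lanes remain to be processed.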
  // Phi nodes for Accumulator, Scan results destination, and Active Lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
  auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);

  // Get the value required for the atomic operation.
  auto *LaneValue =
      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the current active lane so that on the next iteration
  // llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

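// Return the identity value of the given atomic binary operation, i.e. the
// value X such that "V op X == V" for every V. Note that Sub reuses the Add
// identity (zero): the scan for Sub is performed as an add, and the
// subtraction itself only happens in the final atomic operation.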
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
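  // The mbcnt intrinsics each take a 32-bit mask, so for wave64 the 64-bit
  // ballot is split into its low and high halves and the two counts are
  // chained through mbcnt_lo and mbcnt_hi.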
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();
  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        assert(TyBitWidth == 32);
        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
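      // For example, if 17 lanes are active and each would add the same
      // uniform V, a single atomic add of 17 * V has the same effect.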
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. We have also completed the
  // codegen for the new control flow, i.e. the iterative loop which performs
  // the reduction and scan using ComputeLoop and ComputeEnd.
  // For the new control flow, we need to move the branch instruction, i.e. the
  // terminator created during SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the next block
  // for when the single lane is done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    Instruction *Terminator = EntryBB->getTerminator();
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch unconditionally from I's block to the ComputeLoop block for the
    // iterative approach.
    B.SetInsertPoint(EntryBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    DTU.applyUpdates(
        {{DominatorTree::Insert, EntryBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd},
         {DominatorTree::Delete, EntryBB, SingleLaneTerminator->getParent()}});

    Predecessor = ComputeEnd;
  } else {
    Predecessor = EntryBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else {
        LaneOffset = ExclScan;
      }
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}