//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// Atomic optimizer uses following strategies to compute scan and reduced
/// values
/// 1. DPP -
///   This is the most efficient implementation for scan. DPP uses Whole Wave
///   Mode (WWM)
/// 2. Iterative -
///   An alternative implementation iterates over all active lanes
///   of Wavefront using llvm.cttz and performs scan using readlane & writelane
///   intrinsics
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

// One atomic instruction that the visitor pass decided can be optimized.
// Collected during visit(F) and consumed afterwards by optimizeAtomic().
struct ReplacementInfo {
  // The atomic instruction: an AtomicRMWInst or a buffer-atomic intrinsic.
  Instruction *I;
  // The binary operation the atomic performs.
  AtomicRMWInst::BinOp Op;
  // Operand index of the data value inside I (differs between atomicrmw and
  // the buffer intrinsics).
  unsigned ValIdx;
  // Whether the data value is divergent across the wavefront.
  bool ValDivergent;
};

// Legacy pass-manager wrapper; all of the real work happens in
// AMDGPUAtomicOptimizerImpl below.
class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  // Which scan strategy (DPP / Iterative / None) to use.
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

// Implementation shared by the legacy and new pass-manager entry points.
// Visits the function collecting optimizable atomics, then rewrites them.
class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  Function &F;
  // Candidates gathered by the visit callbacks, processed in run().
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo &UA;
  const DataLayout &DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget &ST;
  // Pixel shaders need extra control flow to exclude helper lanes.
  bool IsPixelShader;
  ScanOptions ScanImpl;

  // Wavefront-wide reduction: same result in every lane.
  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  // Wavefront-wide inclusive scan using DPP.
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  // Lane shift-right, turning an inclusive scan into an exclusive one.
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  // Iterative (readlane/writelane loop) alternative to the DPP scan.
  // Returns {exclusive scan value, reduced value}.
  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  // Rewrite one collected atomic so a single lane performs it.
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
                            DomTreeUpdater &DTU, const GCNSubtarget &ST,
                            ScanOptions ScanImpl)
      : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
        ScanImpl(ScanImpl) {}

  bool run();

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo &UA =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  // The dominator tree is optional under the legacy PM; the lazy updater
  // tolerates a null DT and only maintains it when present.
  DominatorTreeWrapperPass *DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}

// New pass-manager entry point.
PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  // The DomTreeUpdater kept the dominator tree in sync with our CFG edits.
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run() {

  // Scan option None disables the Pass
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  // Collect candidates via the InstVisitor callbacks below.
  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

// The cross-lane primitives used below (readlane/writelane/DPP) only handle
// 32- and 64-bit integer or float values, so divergent-value optimization is
// restricted to those types.
static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  // Only the operations we know how to combine across lanes are handled.
  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  // atomicrmw operand layout: operand 0 is the pointer, operand 1 the value.
  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Map the AMDGPU buffer-atomic intrinsics onto the equivalent atomicrmw
// binary op so the rewrite logic in optimizeAtomic() is shared with the
// atomicrmw path.
void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  // For the buffer intrinsics, the data value is operand 0.
  const unsigned ValIdx = 0;

  const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA.isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  // Integer min/max have no single IR binary operator; fall through to the
  // icmp+select emitted after the switch.
  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();

  // Reduce within each row of 16 lanes.
  // Four row_xmask steps (1, 2, 4, 8) butterfly-combine the 16 lanes of a row.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, AtomicTy,
                          {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST.hasPermLaneX16());
  Value *Permlanex16Call =
      B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                        {PoisonValue::get(AtomicTy), V, B.getInt32(0),
                         B.getInt32(0), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST.isWave32()) {
    // 32 lanes fully reduced; done.
    return V;
  }

  if (ST.hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Scan within each row of 16 lanes via row_shr by 1, 2, 4 and 8; lanes
  // shifted out receive Identity (bound-ctrl false, Identity as old value).
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST.hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST.hasPermLaneX16());
    Value *PermX =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                          {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
                           B.getInt32(-1), B.getFalse(), B.getFalse()});

    // Row mask 0xa applies the combine only to the odd rows (16..31, 48..63).
    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST.isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      // Row mask 0xc restricts the combine to the upper two rows (32..63).
      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST.hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    // Shift right by one lane within each row, then patch up the first lane
    // of each row from the last lane of the previous row via read/writelane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST.isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative
// implementation to DPP which uses WWM for scan computations. This API
// iterates over active lanes to read, compute and update the value using
// readlane and writelane intrinsics.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
  auto *EntryBB = I.getParent();
  // Only thread the per-lane intermediate results through the loop when the
  // atomic's old value is actually used.
  auto NeedResult = !I.use_empty();

  // Snapshot of the currently active lanes; the loop retires one bit (lane)
  // per iteration.
  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for ComputeLoop block
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for Accumulator, Scan results destination, and Active Lanes
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for atomic operation
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform writelane if intermediate scan results are required later in the
  // kernel computations
  Value *OldValue = nullptr;
  if (NeedResult) {
    // The accumulator before this lane's contribution is exactly this lane's
    // exclusive-scan value; record it into that lane's slot.
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Set bit to zero of current active lane so that for next iteration llvm.cttz
  // return the next active lane
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  // {exclusive scan per lane (null if unused), wavefront-reduced value}.
  return {OldValue, NewAccumulator};
}

// Identity element for the given scan operation: op(identity, x) == x.
static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    // -0.0 is the true additive identity: -0.0 + x == x even for x == +0.0.
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

// Emit LHS * RHS, folding the multiply away when LHS is the constant 1.
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ?
                             RHS : B.CreateMul(LHS, RHS);
}

// Rewrite one collected atomic so that a single lane performs a combined
// wavefront-wide operation, with the per-lane results reconstructed from the
// broadcast old value plus each lane's offset.
void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // FP atomics must honor the function's strictfp setting when we emit the
  // equivalent non-atomic FP arithmetic.
  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator->getIterator());
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST.isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    // On wave64 mbcnt takes the ballot as two i32 halves, chaining lo then hi.
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform scan with add operation and allow one lane to
  // subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST.hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for scan
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimzer is disabled for None strategy");
    }
  } else {
    // Uniform value: the combined contribution can be computed directly from
    // the active lane count without any cross-lane communication.
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point, we have split the I's block to allow one lane in wavefront
  // to update the precomputed reduced value. Also, completed the codegen for
  // new control flow i.e. iterative loop which performs reduction and scan
  // using ComputeLoop and ComputeEnd.
  // For the new control flow, we need to move branch instruction i.e.
  // terminator created during SplitBlockAndInsertIfThen from I's block to
  // ComputeEnd block. We also need to set up predecessor to next block when
  // single lane done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move terminator from I's block to ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to ComputeLoop Block unconditionally from the I's block for
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
    // the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value who was the lowest active lane (the first
    // lane) to all other lanes in the wavefront.
    Value *BroadcastI = nullptr;
    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimzer is disabled for None strategy");
      }
    } else {
      // Uniform value: derive each lane's offset from its position among the
      // active lanes (Mbcnt).
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
      // is V * +0.0 which might have the wrong sign or might be nan (if V is
      // inf or nan).
      //
      // For all floating point ops if the in-memory value was a nan then the
      // binop we just built might have quieted it or changed its payload.
      //
      // Correct all these problems by using BroadcastI as the result in the
      // first active lane.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
965 B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt()); 966 967 PHINode *const PHI = B.CreatePHI(Ty, 2); 968 PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB); 969 PHI->addIncoming(Result, I.getParent()); 970 I.replaceAllUsesWith(PHI); 971 } else { 972 // Replace the original atomic instruction with the new one. 973 I.replaceAllUsesWith(Result); 974 } 975 } 976 977 // And delete the original. 978 I.eraseFromParent(); 979 } 980 981 INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE, 982 "AMDGPU atomic optimizations", false, false) 983 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) 984 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 985 INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE, 986 "AMDGPU atomic optimizations", false, false) 987 988 FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) { 989 return new AMDGPUAtomicOptimizer(ScanStrategy); 990 } 991