//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DominatorTree *DT, const GCNSubtarget *ST,
                            bool IsPixelShader)
      : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DominatorTree *DT = DTW ? &DTW->getDomTree() : nullptr;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F)
             ? PreservedAnalyses::none()
             : PreservedAnalyses::all();
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
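  // The rewrite itself is deferred to run(), once the walk over the function
  // has finished, so the IR is not modified while it is still being visited.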
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
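  // At this point each lane holds the inclusive scan of its row of 16 lanes:
  // the row_shr:1/2/4/8 steps above combine each lane with the lanes 1, 2, 4
  // and 8 below it, with out-of-row reads supplying Identity. The code below
  // then stitches the rows together across the wavefront.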
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
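  // For example, if the ballot is 0b101001 (lanes 0, 3 and 5 active), lane 0
  // gets an mbcnt of 0, lane 3 gets 1 and lane 5 gets 2.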
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction
      // without too many readlanes and writelanes, which are generally bad
      // for performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values
      // of each active lane in the wavefront. This will be our new value
      // which we will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

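  // At this point NewV holds the single value the whole wavefront will
  // contribute with one atomic: the WWM scan/reduction result for divergent
  // values, or the closed-form combination (e.g. V * popcount(ballot) for
  // add/sub) for uniform ones.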
  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {

      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
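    // For example, for a uniform add of V: the single atomic added
    // V * popcount(ballot) and returned the prior memory value, so lane k
    // (the k-th active lane, i.e. Mbcnt == k) computes BroadcastI + V * k,
    // which is one valid serialization of the original per-lane atomics.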
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}