//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for the scan. DPP uses Whole
///    Wave Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane and
///    writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
    break;
  }

  // Only 32-bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) && !I.getType()->isFloatTy()) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
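  // After the ROW_XMASK butterfly above, every lane of a 16-lane row already
  // holds that row's full reduction, so permlanex16 (which lets each lane read
  // a value from the opposite row of its 32-lane half) followed by one more
  // combine is enough to produce the 32-lane result.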
  assert(ST->hasPermLaneX16());
  V = B.CreateBitCast(V, IntNTy);
  Value *Permlanex16Call = B.CreateIntrinsic(
      Intrinsic::amdgcn_permlanex16, {},
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                          B.CreateBitCast(Permlanex16Call, AtomicTy));
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    V = B.CreateBitCast(V, IntNTy);
    Value *Permlane64Call =
        B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                               B.CreateBitCast(Permlane64Call, AtomicTy));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  V = B.CreateBitCast(V, IntNTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
                             B.CreateBitCast(Lane32, AtomicTy));
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());

  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    V = B.CreateBitCast(V, IntNTy);
    Value *PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall =
        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
                                 B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      V = B.CreateBitCast(V, IntNTy);
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                              UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());

  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(
        WriteLane,
        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
    V = B.CreateBitCast(V, AtomicTy);
    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateBitCast(V, IntNTy);
      V = B.CreateCall(WriteLane,
                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
                                                B.getInt32(31)}),
                        B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
      V = B.CreateBitCast(V, AtomicTy);
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation, which relies on Whole Wave Mode (WWM) for its scan
// computation. The iterative version walks over the active lanes, reading,
// combining and updating values with the readlane and writelane intrinsics.
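// Roughly, the emitted control flow per wavefront is:
//
//   ComputeLoop:
//     Accumulator = phi [Identity, entry], [NewAccumulator, ComputeLoop]
//     ActiveBits  = phi [ballot(true), entry], [NewActiveBits, ComputeLoop]
//     Lane        = cttz(ActiveBits)
//     LaneValue   = readlane(V, Lane)
//     OldValue    = writelane(Accumulator, Lane, OldValuePhi) ; if result used
//     NewAccumulator = Op(Accumulator, LaneValue)
//     NewActiveBits  = ActiveBits & ~(1 << Lane)
//     br (NewActiveBits == 0) ? ComputeEnd : ComputeLoop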
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions into the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // PHI nodes for the accumulator, the scan result destination, and the
  // remaining active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);

  // Get the value required for the atomic operation.
  V = B.CreateBitCast(V, IntNTy);
  Value *LaneValue =
      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
  LaneValue = B.CreateBitCast(LaneValue, Ty);

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue =
        B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
                           B.CreateBitCast(OldValuePhi, IntNTy)});
    OldValue = B.CreateBitCast(OldValue, Ty);
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the current lane's bit in the active mask so that on the next
  // iteration llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
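  // For example, in wave32, if only lanes 0, 1 and 5 execute this code,
  // ballot(true) returns the mask 0b100011; its population count (3) is the
  // number of active lanes used below.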
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();
  Value *Identity = getIdentityValueForAtomicOp(Ty, Op);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If each lane contributes a divergent value, we need to combine the values
  // across the wavefront (with DPP or the iterative loop) before issuing the
  // atomic.
  if (ValDivergent) {
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value,
      // so that they can correctly contribute to the final result.
      V = B.CreateBitCast(V, IntNTy);
      Identity = B.CreateBitCast(Identity, IntNTy);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
                               {V, Identity});
      NewV = B.CreateBitCast(NewV, Ty);
      V = B.CreateBitCast(V, Ty);
      Identity = B.CreateBitCast(Identity, Ty);
      const AtomicRMWInst::BinOp ScanOp =
          Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        assert(TyBitWidth == 32);
        NewV = B.CreateBitCast(NewV, IntNTy);
        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                 {NewV, LastLaneIdx});
        NewV = B.CreateBitCast(NewV, Ty);
      }
      // Finally mark the readlanes in the WWM section.
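      // strict_wwm marks the cross-lane computation feeding NewV as Whole
      // Wave Mode (WWM), so lanes that are inactive at the original atomic
      // still participate in the scan/reduction; the result is then handed
      // back to normally executing lanes.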
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block so that a single lane can update the
  // precomputed reduced value, and (for the iterative strategy) we have
  // emitted the loop that performs the reduction and scan in
  // ComputeLoop/ComputeEnd. For that new control flow we have to move the
  // branch created by SplitBlockAndInsertIfThen (the terminator of I's block)
  // into ComputeEnd, and record which block is the predecessor of the exit
  // block once the single lane has finished updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
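    // After these edits the iterative path is:
    //   EntryBB --> ComputeLoop --(no active lanes left)--> ComputeEnd
    // and ComputeEnd inherits EntryBB's old conditional branch to the
    // single-lane block / exit block.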
    Instruction *Terminator = EntryBB->getTerminator();
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch unconditionally from I's block to the ComputeLoop block for the
    // iterative approach.
    B.SetInsertPoint(EntryBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    DTU.applyUpdates(
        {{DominatorTree::Insert, EntryBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd},
         {DominatorTree::Delete, EntryBB, SingleLaneTerminator->getParent()}});

    Predecessor = ComputeEnd;
  } else {
    Predecessor = EntryBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into the single-lane block, replacing
  // the original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, Int32Ty);
      Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(PHI, 32), Int32Ty);
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
      BroadcastI =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
      BroadcastI = B.CreateBitCast(BroadcastI, Ty);

    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge above the helper-lane branch.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}