//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
///
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for scans. DPP uses Whole Wave
///    Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane and
///    writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
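  // Handles the AMDGPU buffer atomic intrinsics (raw and struct variants) that
  // have the same semantics as the atomicrmw operations matched above.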
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32- and 64-bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
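  // Each lane now holds the reduction of its row of 16 lanes: the four
  // ROW_XMASK steps above combine lanes at XOR distances 1, 2, 4 and 8 within
  // a row.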

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = B.CreateBitCast(V, IntNTy);
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                          B.CreateBitCast(Permlanex16Call, AtomicTy));
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    V = B.CreateBitCast(V, IntNTy);
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                               B.CreateBitCast(Permlane64Call, AtomicTy));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
  V = B.CreateBitCast(V, IntNTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
                             B.CreateBitCast(Lane32, AtomicTy));
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());

  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    V = B.CreateBitCast(V, IntNTy);
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall =
        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
                                 B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      V = B.CreateBitCast(V, IntNTy);
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                              UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());

  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane = Intrinsic::getDeclaration(
        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
    Function *WriteLane = Intrinsic::getDeclaration(
        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(
        WriteLane,
        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
    V = B.CreateBitCast(V, AtomicTy);
    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateBitCast(V, IntNTy);
      V = B.CreateCall(WriteLane,
                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
                                                B.getInt32(31)}),
                        B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
                                   B.getInt32(47)}),
           B.getInt32(48), V});
      V = B.CreateBitCast(V, AtomicTy);
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the
// DPP-based implementation, which uses WWM for its scan computations. This
// routine iterates over the active lanes and reads, computes and updates the
// value using the readlane and writelane intrinsics.
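//
// A rough sketch of the control flow this builds (block and value names match
// the ones created below):
//
//   ComputeLoop:
//     Accumulator = phi [Identity, entry], [NewAccumulator, ComputeLoop]
//     OldValuePhi = phi [poison, entry], [OldValue, ComputeLoop]    ; if needed
//     ActiveBits  = phi [Ballot, entry], [NewActiveBits, ComputeLoop]
//     Lane        = cttz(ActiveBits)
//     LaneValue   = readlane(V, Lane)
//     OldValue    = writelane(Accumulator, Lane, OldValuePhi)       ; if needed
//     NewAccumulator = Accumulator <op> LaneValue
//     NewActiveBits  = ActiveBits & ~(1 << Lane)
//     br (NewActiveBits == 0) ? ComputeEnd : ComputeLoop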
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions into the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // PHI nodes for the accumulator, the scan result destination, and the active
  // lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);

  // Get the value required for the atomic operation.
  V = B.CreateBitCast(V, IntNTy);
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});
  LaneValue = B.CreateBitCast(LaneValue, Ty);

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue =
        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
                           B.CreateBitCast(OldValuePhi, IntNTy)});
    OldValue = B.CreateBitCast(OldValue, Ty);
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the current active lane so that on the next iteration
  // llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    // -0.0 is the additive identity: X + -0.0 keeps the sign of X (including
    // for X == -0.0), whereas X + +0.0 would turn -0.0 into +0.0.
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
    return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMax:
    return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), true));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, we have to mask out helper lane invocations,
  // so we need to record the entry and exit BBs.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform the scan with the add operation and allow one lane
  // to subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      V = B.CreateBitCast(V, IntNTy);
      Identity = B.CreateBitCast(Identity, IntNTy);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
                               {V, Identity});
      NewV = B.CreateBitCast(NewV, Ty);
      V = B.CreateBitCast(V, Ty);
      Identity = B.CreateBitCast(Identity, Ty);
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        assert(TyBitWidth == 32);
        NewV = B.CreateBitCast(NewV, IntNTy);
        NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
        NewV = B.CreateBitCast(NewV, Ty);
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. We have also finished the codegen
  // for the new control flow, i.e. the iterative loop that performs the
  // reduction and scan using ComputeLoop and ComputeEnd.
  // For that new control flow, we need to move the branch instruction (the
  // terminator created by SplitBlockAndInsertIfThen) from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the next block
  // for when the single lane has finished updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch unconditionally from I's block to the ComputeLoop block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, so make sure
    // we move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into the single_lane block, replacing
  // the original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
      Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
      CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
      CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
      BroadcastI =
          B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
      BroadcastI = B.CreateBitCast(BroadcastI, Ty);

    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}