//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DominatorTree *DT, const GCNSubtarget *ST,
                            bool IsPixelShader)
      : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DominatorTree *DT = DTW ? &DTW->getDomTree() : nullptr;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F)
             ? PreservedAnalyses::none()
             : PreservedAnalyses::all();
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
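  // Each step combines every lane with the lane whose index differs in bit
  // Idx of the row (row_xmask 1, 2, 4, 8), i.e. a butterfly reduction that
  // leaves all 16 lanes of a row holding the row-wide result.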
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

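  // Within each row of 16 lanes, build an inclusive (Hillis-Steele) scan:
  // each step combines a lane with the lane 1, 2, 4 and then 8 positions
  // below it (row_shr), with out-of-range reads supplying the identity.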
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
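    // row_bcast:15 feeds each row's lane 15 into the following row (the row
    // mask 0xa limits the write to rows 1 and 3), and row_bcast:31 then feeds
    // lane 31 into the upper half of the wave (row mask 0xc, rows 2 and 3).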
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

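// Return the identity element for Op at the given bit width, i.e. the value I
// with Op(X, I) == X for every X. Sub shares Add's identity (zero) because the
// wave-wide scan for Sub is performed as an Add; the single atomic instruction
// that remains still performs the subtraction of the combined total.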
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

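// An illustrative sketch of the rewrite this function performs for a
// wave-uniform `atomicrmw add` (not the exact IR that is emitted; the value
// names are made up, and readfirstlane, WWM and pixel-shader handling are
// omitted):
//
//   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
//   %mbcnt  = <number of active lanes below this one, via mbcnt_lo/hi>
//   %count  = <popcount of %ballot>
//   br i1 <%mbcnt == 0>, label %single_lane, label %exit
// single_lane:
//   %old = atomicrmw add ptr %p, i32 <%v * %count>
//   br label %exit
// exit:
//   %phi    = phi i32 [ poison, %entry ], [ %old, %single_lane ]
//   %result = add i32 <broadcast of %phi>, <%v * %mbcnt>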
void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
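    // For wave64 the 64-bit ballot is split into two 32-bit halves: mbcnt_lo
    // counts the active lanes below us in the low half, and mbcnt_hi adds
    // those in the high half.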
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

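    // A subtraction is scanned as an addition of the per-lane operands; the
    // single atomic instruction emitted below still performs the subtraction
    // of the wave-wide total.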
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction without
      // too many readlanes and writelanes, which are generally bad for
      // performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values of
      // each active lane in the wavefront. This will be our new value which we
      // will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below it, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {

      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}