//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and all the sample_ids fall into the same
//   group of four,
// - they have the same dmask, and
// - the combine does not increase the number of intrinsics or the total
//   number of vaddr/vdata dword transfers.
//
// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// |  (dmask) |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
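// Reading the first row as a worked example: four loads with dmask popcount 1
// each transfer 3 vaddr dwords (s, t, fragid) and 1 vdata dword, i.e. 12 / 4
// in total, while the single replacement msaa_load transfers 3 vaddr dwords
// and 4 vdata dwords (one per fragment), so both the instruction count and
// the total dword traffic shrink.
//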
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics, fewer vaddr dwords, and fewer total transfers
// between SP and TX, but more vdata dwords. We start by erring on the side of
// converting these to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
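// A sketch of the multi-channel case (the %rg/%r/%g value names are
// illustrative only, not produced by the pass): with dmask=3, four loads
// returning <2 x float> become one msaa_load per enabled channel, and each
// original result is rebuilt from lane FragId%4 of the per-channel results:
//  %rg0 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  ...
//  %rg3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  %r = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  %g = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// followed by extractelement/insertelement chains that rebuild each original
// <2 x float> result.
//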
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim (the dimension is encoded in the intrinsic ID).
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16 (encoded in the element type of the return value).
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
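        // Two loads are only mergeable if their FragIds fall into the same
        // group of four (0-3, 4-7, ...), because a single MSAA_LOAD returns
        // the four consecutive fragments of one such group.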
        auto *FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto *FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect a list of all the instructions we know how to merge in a subset of
// the block. It returns an iterator to the instruction after the last one
// analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

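    // Each msaa_load returns all four fragments of one channel, so the new
    // return type is always a 4-element vector of the original element type.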
    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // The combine must not increase the number of instructions or the total
    // number of vaddr/vdata dword transfers.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
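    // E.g. two <2 x float> loads (dmask popcount 2, no d16) cost 6 vaddr +
    // 4 vdata dwords, while the two replacement msaa_loads would cost 6 vaddr
    // + 8 vdata dwords, so that case is rejected below (the "no" row in the
    // table above).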

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto *FragId =
        cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
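    // NewFragIdVal is FragId rounded down to the base of its group of four,
    // e.g. a FragId of 5 yields 4, so the msaa_load fetches fragments 4-7.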

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsics, one per enabled dmask
    // channel.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
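      // Peel off the lowest enabled channel each iteration; e.g. a dmask of
      // 0b0101 produces two msaa_loads, with dmask 1 and dmask 4.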
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
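    // Each original load's result is rebuilt by taking lane FragId % 4 from
    // the per-channel msaa_load results: a single extractelement when only
    // one channel is enabled, otherwise a chain of insertelements.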
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto *Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto *I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
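  // Process each block in sections delimited by side-effecting instructions
  // (collectMergeableInsts stops at those), so loads are never merged across
  // a store or a memory barrier.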
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
336