//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask, and the combine reduces both the number of
//   intrinsics and the number of vaddr/vdata dword transfers.
//
// Examples of the tradeoff (all assuming 2DMSAA for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// |  (dmask) |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics, fewer vaddr dwords, and fewer total dword transfers
// between SP and TX, but more vdata dwords. We start by erring on the side of
// converting these to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

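// Try to add II to an existing list in MergeableInsts: candidates must use the
// same intrinsic, return type, dmask, and vaddr operands (except FragId), and
// their constant FragIds must fall into the same group of four (e.g. 0..3 or
// 4..7, matching the four samples one image_msaa_load returns). Otherwise,
// start a new list.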
void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check DMask.
    Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
    Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
    if (DMaskList != DMask)
      continue;

    // Check VAddr (except FragId).
    int I = ImageDimIntr->VAddrStart;
    for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
      if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
        break;
    }

    if (I != ImageDimIntr->VAddrEnd - 1)
      continue;

    // Check FragId group.
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
    auto IIListFragId = cast<ConstantInt>(FragIdList);
    auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
    if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect a list of all instructions we know how to merge in a subset of the
// block. Returns an iterator to the instruction after the last one analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

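// For each mergeable group of more than one load, emit image_msaa_load calls
// (one per bit set in the dmask, each returning four samples) and rebuild the
// original results with extractelement/insertelement, provided the heuristic
// below says the combine reduces the number of instructions and dword
// transfers. Sketch for a single-channel group with FragIds 0..3: the four
// scalar loads become one <4 x float> msaa_load and each original value is
// lane FragId % 4 of its result.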
bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // The number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced by the combine.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
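    // Rough worked example (not exhaustive): with dmask = 0x3, d16 = 0 and
    // four merged loads this is 12 + 8 = 20 load dwords vs. 6 + 8 = 14
    // msaa_load dwords, so the combine proceeds; with only two such loads it
    // is 6 + 4 = 10 vs. 14, so it does not.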

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
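    // Each merged load's value is lane (FragId % 4) of the per-channel
    // msaa_load results; multi-channel loads are rebuilt by inserting that
    // lane from each NewCalls[I] into a vector. For illustration, a load with
    // FragId 2 and dmask 0x1 becomes an extractelement of NewCalls[0] at
    // index 2.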
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

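// Shared implementation of the legacy and new pass manager passes. Returns
// true if any image_load intrinsics in F were combined into image_msaa_load.
// Bails out on targets before GFX11 or with the MSAA-load dst-sel bug, and
// when the module does not use the relevant intrinsics at all.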
static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (std::none_of(M->begin(), M->end(), [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {

  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
342