//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask, and the number of intrinsics and the number of
//   vaddr/vdata dword transfers are reduced by the combine.
//
// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// |  (dmask) |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics, fewer vaddr dwords, and fewer total transfers
// between SP and TX, but more vdata dwords. We start by erring on the side of
// converting these to MSAA_LOAD.
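//
// For reference, the dword counts in the table follow from the cost model
// used below: each image_load consumes 3 vaddr dwords (s, t, fragid) and
// writes popcount(dmask) vdata dwords (halved and rounded up for d16), while
// each image_msaa_load also consumes 3 vaddr dwords but always writes all 4
// samples of a single channel (4 vdata dwords, or 2 for d16). E.g. the first
// row: 4 loads * 3 = 12 vaddr and 4 * 1 = 4 vdata dwords, versus a single
// msaa_load with 3 vaddr and 4 vdata dwords.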
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
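//
// When the dmask has multiple channels set, one msaa_load is emitted per
// channel and each original result is rebuilt from the per-channel vectors.
// An illustrative sketch (value names hypothetical; dmask=3, i.e. channels R
// and G, four loads as above):
//  %v0 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  ...
// ==>
//  %r = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  %g = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// followed by extractelement/insertelement to reassemble each sample's
// <2 x float> result.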
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
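        // One msaa_load fetches four consecutive samples, so sample ids
        // 0..3, 4..7, ... each form one mergeable group.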
        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect a list of all the instructions we know how to merge in a subset of
// the block. Returns an iterator to the instruction after the last one
// analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
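    // Such an instruction ends the current section; step past it so the next
    // scan resumes after the barrier.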
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

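    // Every load in the group reads the same group of four samples, so the
    // new calls use the group's base sample id (FragId rounded down to a
    // multiple of 4).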
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
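    // One call is emitted per set dmask bit, e.g. DMaskVal == 0b0101 yields
    // two calls, with dmask 0b0001 and 0b0100.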
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
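      // Each new call holds 4 consecutive samples of one channel; the
      // original sample id modulo 4 selects this load's lane.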
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (std::none_of(M->begin(), M->end(), [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}