//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask and the number of intrinsics and the number of
//   vaddr/vdata dword transfers are reduced by the combine.
//
// Examples of the tradeoff (all assume 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics, fewer vaddr dwords, and fewer total transfers
// between SP and TX, but more vdata dwords. We start by erring on the side of
// converting these to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD instructions are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
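//
// clang-format off
//
// For a multi-channel dmask, one image_msaa_load is emitted per set dmask bit
// and each original result is then rebuilt from those loads. A sketch of the
// dmask = 0x3 case (illustrative only; the four original loads differ only in
// their fragment id, as in the example above):
// call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// ...
// ==>
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on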
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check DMask.
    Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
    Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
    if (DMaskList != DMask)
      continue;

    // Check VAddr (except FragId).
    int I = ImageDimIntr->VAddrStart;
    for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
      if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
        break;
    }

    if (I != ImageDimIntr->VAddrEnd - 1)
      continue;

    // Check FragId group.
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
    auto IIListFragId = cast<ConstantInt>(FragIdList);
    auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
    if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect a list of all the instructions we know how to merge in a subset of
// the block. Returns an iterator to the instruction after the last one
// analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
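      // The FragId must be a compile-time constant: loads are grouped by
      // FragId / 4 when merging, and FragId % 4 later selects the lane to
      // extract from the wide msaa_load result.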
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
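    // Each per-channel msaa_load returns 4 samples; lane FragId % 4 holds the
    // sample the original load asked for. For multi-channel dmasks, the
    // per-channel values are reassembled into the original result type with
    // insertelement, taking one channel from each new call.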
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (std::none_of(M->begin(), M->end(), [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
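
// Note: a minimal way to exercise this pass in isolation (assuming it is
// registered with the new pass manager under its DEBUG_TYPE name, which is the
// usual convention for AMDGPU passes) would be:
//   opt -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-image-intrinsic-opt -S in.ll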