//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask and the number of intrinsics and the number of
//   vaddr/vdata dword transfers is reduced by the combine.
//
// Examples of the tradeoff (all assume 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
// and TX, but higher vdata. We start by erring on the side of converting these
// to MSAA_LOAD.
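//
// To read the first row: each 2dmsaa load sends 3 vaddr dwords (s, t, fragid)
// and returns popcount(dmask) vdata dwords, so 4 loads cost 12 vaddr and
// 4 vdata dwords, while a single msaa_load fetches all 4 samples of one
// channel for 3 vaddr dwords and the same 4 vdata dwords.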
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;
}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;
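// Put II into an existing bucket of mergeable loads if it matches one: same
// intrinsic (Dim), same return type (D16) and matching arguments, except that
// the constant FragIds only have to fall into the same group of four samples.
// Otherwise start a new bucket.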
void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
        auto *FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto *FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect a list of all the instructions we know how to merge in a subset of
// the block. Returns an iterator to the instruction after the last one
// analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
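      // Only a compile-time-constant FragId can be assigned to a sample
      // group here.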
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto *FragId =
        cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
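    // Each set bit of the dmask becomes one msaa_load with a single-channel
    // dmask and a FragId rounded down to the start of its four-sample group;
    // each original load's result is then rebuilt with extractelements.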
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto *Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto *I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
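  // Checking the intrinsic declarations in the module is much cheaper than
  // walking every instruction of the function.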
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}