//===-- AMDGPULowerKernelArguments.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS) {
      return HiddenArgs[HA].Name;
    }
    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated
  // by LastPreloadIndex. Currently preloading is only performed on the
  // totality of sequential data from the kernarg segment including implicit
  // (hidden) arguments. This means that all arguments up to the last preloaded
  // argument will also be preloaded even if that data is unused.
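  //
  // For example, with LastPreloadIndex == HIDDEN_GROUP_SIZE_X the cloned
  // function gains four trailing hidden parameters (three i32 block counts
  // followed by an i16 group size), each marked inreg and tagged with the
  // "amdgpu-hidden-argument" attribute.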
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);
    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);
    F.setCallingConv(CallingConv::C);

    return NF;
  }

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Sets NumFreeUserSGPRs to the maximum number of user SGPRs available to
  // preload arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
                            uint64_t LastExplicitArgOffset) {
    // Check if this argument may be loaded into the same register as the
    // previous argument.
    if (ArgOffset - LastExplicitArgOffset < 4 &&
        !isAligned(Align(4), ArgOffset))
      return true;

    // Pad SGPRs for kernarg alignment.
    ArgOffset = alignDown(ArgOffset, 4);
    unsigned Padding = ArgOffset - LastExplicitArgOffset;
    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
      return false;

    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
    return true;
  }

  // Try to allocate SGPRs to preload implicit kernel arguments.
  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                       uint64_t LastExplicitArgOffset,
                                       IRBuilder<> &Builder) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Each pair is a load instruction and its offset from the implicitarg
    // pointer.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
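          // Only the first user of the GEP is considered; anything other than
          // a simple load of the matching hidden-argument type is rejected by
          // the checks below.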
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = std::find_if(
        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
        [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!tryAllocPreloadSGPRs(LoadSize,
                                    LoadOffset + ImplicitArgsBaseOffset,
                                    LastExplicitArgOffset))
            return true;

          LastExplicitArgOffset =
              ImplicitArgsBaseOffset + LoadOffset + LoadSize;
          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
};

class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

// skip allocas
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded
    // kernargs, so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  uint64_t ExplicitArgOffset = 0;
  // Preloaded kernel arguments must be sequential.
  bool InPreloadSequence = true;
  PreloadKernelArgInfo PreloadInfo(F, ST);

  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

    // Guard against the situation where hidden arguments have already been
    // lowered and added to the kernel function signature, i.e. when this pass
    // has run twice.
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      break;

    // Try to preload this argument into user SGPRs.
    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
        !Arg.getType()->isAggregateType())
      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
                                           LastExplicitArgOffset))
        continue;

    InPreloadSequence = false;

    if (Arg.use_empty())
      continue;

    // If this is byval, the loads are already explicit in the function. We
    // just need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr =
          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      // TODO: Update this for GFX12 which does have scalar sub-dword loads.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

    if (Arg.hasAttribute(Attribute::NoUndef))
      Load->setMetadata(LLVMContext::MD_noundef, MDNode::get(Ctx, {}));

    if (Arg.hasAttribute(Attribute::Range)) {
      const ConstantRange &Range =
          Arg.getAttribute(Attribute::Range).getValueAsConstantRange();
      Load->setMetadata(LLVMContext::MD_range,
                        MDB.createRange(Range.getLower(), Range.getUpper()));
    }

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx,
                        MDB.createConstant(
                            ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                            DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  if (InPreloadSequence) {
    uint64_t ImplicitArgsBaseOffset =
        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
        BaseOffset;
    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                ExplicitArgOffset, Builder);
  }

  return true;
}

bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  auto &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return lowerKernelArguments(F, TM);
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}

PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool Changed = lowerKernelArguments(F, TM);
  if (Changed) {
    // TODO: Preserves a lot more.
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }

  return PreservedAnalyses::all();
}