//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
                                                const Function &F) const {
  // FIXME: We should take into account the LDS allocation granularity.
  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);

  // The queried LDS size may be larger than what is available on a CU, in
  // which case we consider the only achievable occupancy to be 1, in line
  // with what we consider the occupancy to be when the number of requested
  // registers in a particular bank is higher than the number of available
  // ones in that bank.
  if (!MaxWGsLDS)
    return {1, 1};

  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();

  auto PropsFromWGSize = [=](unsigned WGSize)
      -> std::tuple<const unsigned, const unsigned, unsigned> {
    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
  };

  // The maximum group size will generally yield the minimum number of
  // workgroups, the maximum number of waves per workgroup, and the minimum
  // occupancy. The opposite is generally true for the minimum group size. LDS
  // or barrier resource limitations can flip those minimums/maximums.
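  // Illustrative example (hypothetical numbers, not tied to any particular
  // subtarget): with WaveSize = 64 and MaxWGsLDS = 3, a workgroup size of 256
  // needs divideCeil(256, 64) = 4 waves, and even if getMaxWorkGroupsPerCU
  // returned 10, only min(10, 3) = 3 groups would fit concurrently, for
  // 4 * 3 = 12 concurrent waves on the CU.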
  const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);

  // It is possible that we end up with flipped minimum and maximum number of
  // waves per CU when the number of minimum/maximum concurrent groups on the
  // CU is limited by LDS usage or barrier resources.
  if (MinWavesPerCU >= MaxWavesPerCU) {
    std::swap(MinWavesPerCU, MaxWavesPerCU);
  } else {
    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

    // Look for a potential smaller group size than the maximum which decreases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned MinWavesPerCUForWGSize =
        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
        // There may exist a smaller group size than the maximum that achieves
        // the minimum number of waves per CU. This group size is the largest
        // possible size that requires MaxWavesPerWG - E waves where E is
        // maximized under the following constraints.
        // 1. 0 <= E <= ExcessSlotsPerWG
        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
                                                MaxWavesPerWG - MinWavesPerWG);
      }
    }

    // Look for a potential larger group size than the minimum which increases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
      // There may exist a larger group size than the minimum that achieves the
      // maximum number of waves per CU. This group size is the smallest
      // possible size that requires MinWavesPerWG + L waves where L is
      // maximized under the following constraints.
      // 1. 0 <= L <= LeftoverSlotsPerWG
      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
                                              ((MaxWGSize - 1) / WaveSize) +
                                                  1 - MinWavesPerWG);
    }
  }

  // Return the minimum/maximum number of waves on any EU, assuming that all
  // wavefronts are spread across all EUs as evenly as possible.
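  // For example (hypothetical numbers): with MinWavesPerCU = 12,
  // MaxWavesPerCU = 40, 4 EUs per CU, and WavesPerEU = 10, this returns
  // {clamp(12 / 4, 1, 10), clamp(divideCeil(40, 4), 1, 10)} = {3, 10}.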
  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U,
                     WavesPerEU)};
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
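  // The attribute has the form "amdgpu-waves-per-eu"="min[,max]"; passing
  // OnlyFirstRequired = true below lets the maximum be omitted, in which case
  // it defaults to getMaxWavesPerEU().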
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). An ID is at most MaxSize - 1, so MaxSize is
  // already the correct exclusive upper bound for an ID query. A size query
  // can return MaxSize itself, so its bound must be MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
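  // The "amdgpu-no-implicitarg-ptr" attribute is normally inferred by the
  // AMDGPUAttributor when the implicit argument pointer is provably unused.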
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

// FIXME: This has no reason to be in subtarget
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}
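// Illustrative walk-through of getKernArgSegmentSize (hypothetical values,
// assuming getExplicitKernelArgOffset() == 0, an 8-byte implicit-arg
// alignment, and a code object version below 5): a kernel taking (i32, ptr)
// has ExplicitArgBytes = alignTo(0, 4) + 4 = 4, then alignTo(4, 8) + 8 = 16.
// With 56 implicit bytes, TotalSize = alignTo(16, 8) + 56 = 72, and
// alignTo(72, 4) keeps the segment at 72 bytes.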