//===-- AMDGPULowerKernelAttributes.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#include <limits>

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPacketOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets relative to the implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 ||
      MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
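  // Range metadata upper bounds are exclusive, so [1, MaxNumGroups + 1)
  // states that the loaded block count lies between 1 and MaxNumGroups.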
  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
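      // Pre-V5, offsets index into hsa_kernel_dispatch_packet_t, which only
      // exposes the workgroup and grid sizes; there are no block count or
      // remainder fields to record here.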
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under V5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //   ->
    //   grid_size < group_size + (group_id * group_size)
    //
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

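      // The 16-bit group size load is zero-extended before the library code
      // uses it, so the matching below looks through the ZExt. Illustrative
      // sketch of the shape being matched (hand-written names, not taken from
      // an actual test):
      //   %group.id = call i32 @llvm.amdgcn.workgroup.id.x()
      //   %gs.zext  = zext i16 %group.size to i32
      //   %mul      = mul i32 %group.id, %gs.zext
      //   %sub      = sub i32 %grid.size, %mul
      //   %cmp      = icmp ult i32 %sub, %gs.zext
      //   %size     = select i1 %cmp, i32 %sub, i32 %gs.zext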
      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We do not seem to get
// TargetPassConfig for the subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}