//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
  // TODO: Allow arrays and structs, if all members are barriers
  // in the same scope.
  // TODO: Disallow other uses of target("amdgcn.named.barrier") including:
  // - Structs containing barriers in different scopes.
  // - Structs containing a mixture of barriers and other data.
  // - Globals in other address spaces.
  // - Allocas.
  Type *Ty = GV.getValueType();
  while (true) {
    if (auto *TTy = dyn_cast<TargetExtType>(Ty))
      return TTy->getName() == "amdgcn.named.barrier" ? TTy : nullptr;
    if (auto *STy = dyn_cast<StructType>(Ty)) {
      if (STy->getNumElements() == 0)
        return nullptr;
      Ty = STy->getElementType(0);
      continue;
    }
    return nullptr;
  }
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-size addrspace(3) global without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

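// Convert all ConstantExpr users of LDS variables in this module into
// instructions, so that subsequent passes only have to reason about
// instruction uses of LDS. Returns true if any rewriting was done.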
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may have uses from multiple different functions as a result.
  // This pass specialises LDS variables with respect to the kernel that
  // allocates them.

  // This is semantically equivalent to (not implemented that way because it
  // would be slow):
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv, it instead calls into
  // isModuleEntryFunction which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect functions whose address has escaped.
  DenseSet<Function *> AddressTakenFuncs;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        AddressTakenFuncs.insert(&F);
      }
  }

  // Collect variables that are used by functions whose address has escaped.
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls.
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If a function makes any unknown call, assume the worst case: it can
  // access all variables accessed by functions whose address escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function.
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // This could be accelerated by consulting the transitive map for
      // functions that have already been computed, but that needs more care
      // than this.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }

  // Collect variables that are transitively used by functions whose address
  // has escaped.
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer,
              TransitiveMapFunction[F]);
  }

  // DirectMapKernel lists which variables are used directly by each kernel;
  // now find the variables which are used through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      }
    }

    // Check if the kernel encounters unknown calls, whether directly or
    // indirectly.
    bool SeesUnknownCalls = [&]() {
      SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
      SmallPtrSet<Function *, 8> Visited;

      while (!WorkList.empty()) {
        Function *F = WorkList.pop_back_val();

        for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
          if (!CallRecord.second)
            continue;

          Function *Callee = CallRecord.second->getFunction();
          if (!Callee)
            return true;

          if (Visited.insert(Callee).second)
            WorkList.push_back(Callee);
        }
      }
      return false;
    }();

    if (SeesUnknownCalls) {
      set_union(IndirectMapKernel[&Func],
                VariablesReachableThroughFunctionPointer);
    }
  }

  // Verify that we fall into one of two cases:
  // - All variables are either absolute or direct-mapped dynamic LDS that is
  //   not lowered. This is a re-run of the pass, so there is nothing to do.
  // - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  bool HasSpecialGVs = false;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (isNamedBarrier(*GV)) {
          HasSpecialGVs = true;
          continue;
        }
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            report_fatal_error(
                "Module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, there is nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap(), false};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel),
          HasSpecialGVs};
}

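// Strip the given function attributes from KernelRoot and from every function
// reachable from it through the call graph. If an indirect call is seen, fall
// back to stripping the attributes from every non-kernel function reachable
// from the call graph's external calling node, since any of those could be
// the target.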
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load; any atomic is a
  // universal MemoryDef from MSSA's point of view, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

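// Check whether the memory read by Load may be clobbered anywhere in its
// function, walking MemorySSA upwards from the nearest clobbering access and
// using isReallyAClobber to skip barriers, fences and non-aliasing atomics
// that MemorySSA conservatively treats as clobbers.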
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access; it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In that
  // case add all the Defs to the WorkList and continue going up, checking all
  // the definitions of this memory location until the root. If all the defs
  // are exhausted and we have reached the entry state, there is no clobber.
  // Along the scan, ignore barriers and fences, which MemorySSA considers
  // clobbers even though they do not really write anything into memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU