//===-- AMDGPULowerBufferFatPointers.cpp ---------------------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers operations on buffer fat pointers (addrspace 7) to
// operations on buffer resources (addrspace 8) and is needed for correct
// codegen.
//
// # Background
//
// Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists
// of a 128-bit buffer descriptor and a 32-bit offset into that descriptor.
// The buffer resource part needs to be a "raw" buffer resource (it must have
// a stride of 0 and bounds checks must be in raw buffer mode or disabled).
//
// When these requirements are met, a buffer resource can be treated as a
// typical (though quite wide) pointer that follows typical LLVM pointer
// semantics. This allows the frontend to reason about such buffers (which are
// often encountered in the context of SPIR-V kernels).
//
// However, because of their non-power-of-2 size, these fat pointers cannot be
// present during translation to MIR (though this restriction may be lifted
// during the transition to GlobalISel). Therefore, this pass is needed in
// order to correctly implement these fat pointers.
//
// The resource intrinsics take the resource part (the address space 8 pointer)
// and the offset part (the 32-bit integer) as separate arguments. In addition,
// many users of these buffers manipulate the offset while leaving the resource
// part alone. For these reasons, we want to typically separate the resource
// and offset parts into separate variables, but combine them together when
// encountering cases where this is required, such as by inserting these values
// into aggregates or moving them to memory.
//
// Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8)
// %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8),
// i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes
// `{vector<Nxp8>, vector<Nxi32>}` and its component parts.
//
// # Implementation
//
// This pass proceeds in three main phases:
//
// ## Rewriting loads and stores of p7
//
// The first phase is to rewrite away all loads and stores of
// `ptr addrspace(7)`, including aggregates containing such pointers, to ones
// that use `i160`. This is handled by `StoreFatPtrsAsIntsVisitor`, which
// visits loads, stores, and allocas and, if the loaded or stored type contains
// `ptr addrspace(7)`, rewrites that type to one where the p7s are replaced by
// i160s, copying other parts of aggregates as needed. In the case of a store,
// each pointer is `ptrtoint`d to i160 before storing, and loaded integers are
// `inttoptr`d back. This same transformation is applied to vectors of
// pointers.
//
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we would
// have to handle the incompatibility between a `{Nxp8, Nxi32}` representation
// and `Nxi160` directly.
// Instead, that transposing action (where the vectors
// of resources and vectors of offsets are concatenated before being stored to
// memory) is handled through implementing `inttoptr` and `ptrtoint` only.
//
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
// ## Type remapping
//
// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
// to the corresponding struct type, which has a resource part and an offset
// part.
//
// This is accomplished with a `BufferFatPtrToStructTypeMap` and a
// `FatPtrConstMaterializer`, usually by way of `setType`ing values. Constants
// are handled here because there isn't a good way to fix them up later.
//
// This has the downside of leaving the IR in an invalid state (for example,
// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
// but all such invalid states will be resolved by the third phase.
//
// Functions that don't take buffer fat pointers are modified in place. Those
// that do take such pointers have their basic blocks moved to a new function
// whose {ptr addrspace(8), i32} arguments and return values replace the
// fat-pointer ones. This phase also records intrinsics so that they can be
// remangled or deleted later.
//
// ## Splitting pointer structs
//
// The meat of this pass consists of defining semantics for operations that
// produce or consume [vectors of] buffer fat pointers in terms of their
// resource and offset parts. This is accomplished through the
// `SplitPtrStructs` visitor.
//
// In the first pass through each function that is being lowered, the splitter
// inserts new instructions to implement the split-structures behavior, which
// is needed for correctness and performance. It records a list of "split
// users", instructions that are being replaced by operations on the resource
// and offset parts.
//
// Split users do not necessarily need to produce parts themselves (a
// `load float, ptr addrspace(7)` does not, for example), but, if they do not
// generate fat buffer pointers, they must RAUW in their replacement
// instructions during the initial visit.
//
// When these new instructions are created, they use the split parts recorded
// for their initial arguments in order to generate their replacements,
// creating a parallel set of instructions that does not refer to the original
// fat pointer values but instead to their resource and offset components.
//
// Instructions, such as `extractvalue`, that produce buffer fat pointers from
// sources that do not have split parts, have such parts generated using
// `extractvalue`. This is also the initial handling of PHI nodes, which
// are then cleaned up.
//
// ### Conditionals
//
// PHI nodes are initially given resource parts via `extractvalue`. However,
// this is not an efficient rewrite of such nodes, as, in most cases, the
// resource part in a conditional or loop remains constant throughout the loop
// and only the offset varies. Failing to optimize away these constant
// resources would cause additional registers to be sent around loops and might
// lead to waterfall loops being generated for buffer operations due to the
// "non-uniform" resource argument.
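//
// For example (an illustrative sketch, not taken from a real test), in a loop
// such as
// ```
//   %p = phi ptr addrspace(7) [ %base, %entry ], [ %p.inc, %loop ]
//   %p.inc = getelementptr float, ptr addrspace(7) %p, i32 1
// ```
// the resource part of %p is always that of %base, so only the 32-bit offset
// needs to be carried by a PHI after the rewrite.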
//
// Therefore, after all instructions have been visited, the pointer splitter
// post-processes all encountered conditionals. Given a PHI node or select,
// getPossibleRsrcRoots() collects all values that the resource parts of that
// conditional's inputs could come from, as well as all conditional
// instructions encountered during the search. If, after filtering out the
// initial node itself, the set of encountered conditionals is a subset of the
// potential roots and there is a single potential resource that isn't in the
// conditional set, that value is the only possible value the resource argument
// could have throughout the control flow.
//
// If that condition is met, then a PHI node can have its resource part changed
// to the singleton value and then be replaced by a PHI on the offsets.
// Otherwise, each PHI node is split into two, one for the resource part and
// one for the offset part, which replace the temporary `extractvalue`
// instructions that were added during the first pass.
//
// Similar logic applies to `select`, where
// `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y`
// can be split into `%z.rsrc = %x.rsrc` and
// `%z.off = select i1 %cond, i32 %x.off, i32 %y.off`
// if both `%x` and `%y` have the same resource part, but two `select`
// operations will be needed if they do not.
//
// ### Final processing
//
// After conditionals have been cleaned up, the IR for each function is
// rewritten to remove all the old instructions that have been split up.
//
// Any instruction that used to produce a buffer fat pointer (and therefore now
// produces a resource-and-offset struct after type remapping) is
// replaced as follows:
// 1. All debug value annotations are cloned to reflect that the resource part
//    and offset parts are computed separately and constitute different
//    fragments of the underlying source language variable.
// 2. All uses that were themselves split are replaced by a `poison` of the
//    struct type, as they will themselves be erased soon. This rule, combined
//    with debug handling, should leave the use lists of split instructions
//    empty in almost all cases.
// 3. If a user of the original struct-valued result remains, the structure
//    needed for the new types to work is constructed out of the newly-defined
//    parts, and the original instruction is replaced by this structure
//    before being erased. Instructions requiring this construction include
//    `ret` and `insertvalue`.
//
// # Consequences
//
// This pass does not alter the CFG.
//
// Alias analysis information will become coarser, as the LLVM alias analyzer
// cannot handle the buffer intrinsics. Specifically, while we can determine
// that the following two loads do not alias:
// ```
// %y = getelementptr i32, ptr addrspace(7) %x, i32 1
// %a = load i32, ptr addrspace(7) %x
// %b = load i32, ptr addrspace(7) %y
// ```
// we cannot (except through some code that runs during scheduling) determine
// that the rewritten loads below do not alias.
// ```
// %y.off = add i32 %x.off, 1
// %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32
//     %x.off, ...)
// %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8)
//     %x.rsrc, i32 %y.off, ...)
// ```
// However, existing alias information is preserved.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"

using namespace llvm;

static constexpr unsigned BufferOffsetWidth = 32;

namespace {
/// Recursively replace instances of ptr addrspace(7) and vector<Nxptr
/// addrspace(7)> with some other type as defined by the relevant subclass.
class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper {
  DenseMap<Type *, Type *> Map;

  Type *remapTypeImpl(Type *Ty, SmallPtrSetImpl<StructType *> &Seen);

protected:
  virtual Type *remapScalar(PointerType *PT) = 0;
  virtual Type *remapVector(VectorType *VT) = 0;

  const DataLayout &DL;

public:
  BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {}
  Type *remapType(Type *SrcTy) override;
  void clear() { Map.clear(); }
};

/// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to
/// vector<Nxi160> in order to correctly handle loading/storing these values
/// from memory.
class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase {
  using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;

protected:
  Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); }
  Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); }
};

/// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset
/// parts of the pointer) so that we can easily rewrite operations on these
/// values that aren't loading them from or storing them to memory.
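///
/// For instance (illustrative only), `<4 x ptr addrspace(7)>` becomes the
/// literal struct `{<4 x ptr addrspace(8)>, <4 x i32>}`.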
class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase {
  using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;

protected:
  Type *remapScalar(PointerType *PT) override;
  Type *remapVector(VectorType *VT) override;
};
} // namespace

// This code is adapted from the type remapper in lib/Linker/IRMover.cpp
Type *BufferFatPtrTypeLoweringBase::remapTypeImpl(
    Type *Ty, SmallPtrSetImpl<StructType *> &Seen) {
  Type **Entry = &Map[Ty];
  if (*Entry)
    return *Entry;
  if (auto *PT = dyn_cast<PointerType>(Ty)) {
    if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
      return *Entry = remapScalar(PT);
    }
  }
  if (auto *VT = dyn_cast<VectorType>(Ty)) {
    auto *PT = dyn_cast<PointerType>(VT->getElementType());
    if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
      return *Entry = remapVector(VT);
    }
    return *Entry = Ty;
  }
  // Whether the type is one that is structurally uniqued - that is, whether it
  // is not a named struct (the only kind of type where distinct `Type *`s can
  // be structurally identical).
  StructType *TyAsStruct = dyn_cast<StructType>(Ty);
  bool IsUniqued = !TyAsStruct || TyAsStruct->isLiteral();
  // Base case for ints, floats, opaque pointers, and so on, which don't
  // require recursion.
  if (Ty->getNumContainedTypes() == 0 && IsUniqued)
    return *Entry = Ty;
  if (!IsUniqued) {
    // Create a dummy type for recursion purposes.
    if (!Seen.insert(TyAsStruct).second) {
      StructType *Placeholder = StructType::create(Ty->getContext());
      return *Entry = Placeholder;
    }
  }
  bool Changed = false;
  SmallVector<Type *> ElementTypes(Ty->getNumContainedTypes(), nullptr);
  for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) {
    Type *OldElem = Ty->getContainedType(I);
    Type *NewElem = remapTypeImpl(OldElem, Seen);
    ElementTypes[I] = NewElem;
    Changed |= (OldElem != NewElem);
  }
  // Recursive calls to remapTypeImpl() may have invalidated the pointer.
  Entry = &Map[Ty];
  if (!Changed) {
    return *Entry = Ty;
  }
  if (auto *ArrTy = dyn_cast<ArrayType>(Ty))
    return *Entry = ArrayType::get(ElementTypes[0], ArrTy->getNumElements());
  if (auto *FnTy = dyn_cast<FunctionType>(Ty))
    return *Entry = FunctionType::get(ElementTypes[0],
                                      ArrayRef(ElementTypes).slice(1),
                                      FnTy->isVarArg());
  if (auto *STy = dyn_cast<StructType>(Ty)) {
    // Genuine opaque types don't have a remapping.
    if (STy->isOpaque())
      return *Entry = Ty;
    bool IsPacked = STy->isPacked();
    if (IsUniqued)
      return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);
    SmallString<16> Name(STy->getName());
    STy->setName("");
    Type **RecursionEntry = &Map[Ty];
    if (*RecursionEntry) {
      auto *Placeholder = cast<StructType>(*RecursionEntry);
      Placeholder->setBody(ElementTypes, IsPacked);
      Placeholder->setName(Name);
      return *Entry = Placeholder;
    }
    return *Entry = StructType::create(Ty->getContext(), ElementTypes, Name,
                                       IsPacked);
  }
  llvm_unreachable("Unknown type of type that contains elements");
}

Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) {
  SmallPtrSet<StructType *, 2> Visited;
  return remapTypeImpl(SrcTy, Visited);
}

Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) {
  LLVMContext &Ctx = PT->getContext();
  return StructType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE),
                         IntegerType::get(Ctx, BufferOffsetWidth));
}

Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) {
  ElementCount EC = VT->getElementCount();
  LLVMContext &Ctx = VT->getContext();
  Type *RsrcVec =
      VectorType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), EC);
  Type *OffVec = VectorType::get(IntegerType::get(Ctx, BufferOffsetWidth), EC);
  return StructType::get(RsrcVec, OffVec);
}

static bool isBufferFatPtrOrVector(Type *Ty) {
  if (auto *PT = dyn_cast<PointerType>(Ty->getScalarType()))
    return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER;
  return false;
}

// True if the type is {ptr addrspace(8), i32} or a struct containing vectors
// of those types. Used to quickly skip instructions we don't need to process.
static bool isSplitFatPtr(Type *Ty) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return false;
  if (!ST->isLiteral() || ST->getNumElements() != 2)
    return false;
  auto *MaybeRsrc =
      dyn_cast<PointerType>(ST->getElementType(0)->getScalarType());
  auto *MaybeOff =
      dyn_cast<IntegerType>(ST->getElementType(1)->getScalarType());
  return MaybeRsrc && MaybeOff &&
         MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE &&
         MaybeOff->getBitWidth() == BufferOffsetWidth;
}

// True if the result type or any argument types are buffer fat pointers.
static bool isBufferFatPtrConst(Constant *C) {
  Type *T = C->getType();
  return isBufferFatPtrOrVector(T) || any_of(C->operands(), [](const Use &U) {
           return isBufferFatPtrOrVector(U.get()->getType());
         });
}

namespace {
/// Convert [vectors of] buffer fat pointers to integers when they are read
/// from or stored to memory. This ensures that these pointers will have the
/// same memory layout as before they are lowered, even though they will no
/// longer have their previous layout in registers/in the program (they'll be
/// broken down into resource and offset parts). This has the downside of
/// imposing marshalling costs when reading or storing these values, but since
/// placing such pointers into memory is an uncommon operation at best, we feel
/// that this cost is acceptable for better performance in the common case.
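///
/// As a rough sketch (hypothetical IR, not from a test), a store like
///   store ptr addrspace(7) %p, ptr %slot
/// is rewritten to approximately
///   %p.int = ptrtoint ptr addrspace(7) %p to i160
///   store i160 %p.int, ptr %slot
/// with the matching load performing the inverse `inttoptr`.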
class StoreFatPtrsAsIntsVisitor
    : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
  BufferFatPtrToIntTypeMap *TypeMap;

  ValueToValueMapTy ConvertedForStore;

  IRBuilder<> IRB;

  // Convert all the buffer fat pointers within the input value to integers
  // so that it can be stored in memory.
  Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
  // Convert all the i160s that need to be buffer fat pointers (as specified
  // by the To type) into those pointers to preserve the semantics of the rest
  // of the program.
  Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);

public:
  StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap,
                            LLVMContext &Ctx)
      : TypeMap(TypeMap), IRB(Ctx) {}
  bool processFunction(Function &F);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitAllocaInst(AllocaInst &I);
  bool visitLoadInst(LoadInst &LI);
  bool visitStoreInst(StoreInst &SI);
  bool visitGetElementPtrInst(GetElementPtrInst &I);
};
} // namespace

Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
                                                const Twine &Name) {
  if (From == To)
    return V;
  ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
  if (Find != ConvertedForStore.end())
    return Find->second;
  if (isBufferFatPtrOrVector(From)) {
    Value *Cast = IRB.CreatePtrToInt(V, To, Name + ".int");
    ConvertedForStore[V] = Cast;
    return Cast;
  }
  if (From->getNumContainedTypes() == 0)
    return V;
  // Structs, arrays, and other compound types.
  Value *Ret = PoisonValue::get(To);
  if (auto *AT = dyn_cast<ArrayType>(From)) {
    Type *FromPart = AT->getArrayElementType();
    Type *ToPart = cast<ArrayType>(To)->getElementType();
    for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
      Value *Field = IRB.CreateExtractValue(V, I);
      Value *NewField =
          fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(I));
      Ret = IRB.CreateInsertValue(Ret, NewField, I);
    }
  } else {
    for (auto [Idx, FromPart, ToPart] :
         enumerate(From->subtypes(), To->subtypes())) {
      Value *Field = IRB.CreateExtractValue(V, Idx);
      Value *NewField =
          fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(Idx));
      Ret = IRB.CreateInsertValue(Ret, NewField, Idx);
    }
  }
  ConvertedForStore[V] = Ret;
  return Ret;
}

Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
                                                const Twine &Name) {
  if (From == To)
    return V;
  if (isBufferFatPtrOrVector(To)) {
    Value *Cast = IRB.CreateIntToPtr(V, To, Name + ".ptr");
    return Cast;
  }
  if (From->getNumContainedTypes() == 0)
    return V;
  // Structs, arrays, and other compound types.
  Value *Ret = PoisonValue::get(To);
  if (auto *AT = dyn_cast<ArrayType>(From)) {
    Type *FromPart = AT->getArrayElementType();
    Type *ToPart = cast<ArrayType>(To)->getElementType();
    for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
      Value *Field = IRB.CreateExtractValue(V, I);
      Value *NewField =
          intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(I));
      Ret = IRB.CreateInsertValue(Ret, NewField, I);
    }
  } else {
    for (auto [Idx, FromPart, ToPart] :
         enumerate(From->subtypes(), To->subtypes())) {
      Value *Field = IRB.CreateExtractValue(V, Idx);
      Value *NewField =
          intsToFatPtrs(Field, FromPart, ToPart, Name + "." +
                                                    Twine(Idx));
      Ret = IRB.CreateInsertValue(Ret, NewField, Idx);
    }
  }
  return Ret;
}

bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
  bool Changed = false;
  // The visitors will mutate GEPs and allocas, but will push loads and stores
  // to the worklist to avoid invalidation.
  for (Instruction &I : make_early_inc_range(instructions(F))) {
    Changed |= visit(I);
  }
  ConvertedForStore.clear();
  return Changed;
}

bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
  Type *Ty = I.getAllocatedType();
  Type *NewTy = TypeMap->remapType(Ty);
  if (Ty == NewTy)
    return false;
  I.setAllocatedType(NewTy);
  return true;
}

bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  Type *Ty = I.getSourceElementType();
  Type *NewTy = TypeMap->remapType(Ty);
  if (Ty == NewTy)
    return false;
  // We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so
  // make sure GEPs don't have different semantics with the new type.
  I.setSourceElementType(NewTy);
  I.setResultElementType(TypeMap->remapType(I.getResultElementType()));
  return true;
}

bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
  Type *Ty = LI.getType();
  Type *IntTy = TypeMap->remapType(Ty);
  if (Ty == IntTy)
    return false;

  IRB.SetInsertPoint(&LI);
  auto *NLI = cast<LoadInst>(LI.clone());
  NLI->mutateType(IntTy);
  NLI = IRB.Insert(NLI);
  copyMetadataForLoad(*NLI, LI);
  NLI->takeName(&LI);

  Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
  LI.replaceAllUsesWith(CastBack);
  LI.eraseFromParent();
  return true;
}

bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
  Value *V = SI.getValueOperand();
  Type *Ty = V->getType();
  Type *IntTy = TypeMap->remapType(Ty);
  if (Ty == IntTy)
    return false;

  IRB.SetInsertPoint(&SI);
  Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName());
  for (auto *Dbg : at::getAssignmentMarkers(&SI))
    Dbg->setValue(IntV);

  SI.setOperand(0, IntV);
  return true;
}

/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a
/// lowered buffer fat pointer constant.
static std::pair<Constant *, Constant *>
splitLoweredFatBufferConst(Constant *C) {
  if (auto *AZ = dyn_cast<ConstantAggregateZero>(C))
    return std::make_pair(AZ->getStructElement(0), AZ->getStructElement(1));
  if (auto *SC = dyn_cast<ConstantStruct>(C))
    return std::make_pair(SC->getOperand(0), SC->getOperand(1));
  llvm_unreachable("Conversion should've created a {p8, i32} struct");
}

namespace {
/// Handle the remapping of ptr addrspace(7) constants.
class FatPtrConstMaterializer final : public ValueMaterializer {
  BufferFatPtrToStructTypeMap *TypeMap;
  BufferFatPtrToIntTypeMap *IntTypeMap;
  // An internal mapper that is used to recurse into the arguments of
  // constants. While the documentation for `ValueMapper` specifies not to use
  // it recursively, examination of the logic in mapValue() shows that it can
  // safely be used recursively when handling constants, as mapValue() itself
  // does.
  ValueMapper InternalMapper;

  Constant *materializeBufferFatPtrConst(Constant *C);

  const DataLayout &DL;

public:
  // UnderlyingMap is the value map this materializer will be filling.
  FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap,
                          ValueToValueMapTy &UnderlyingMap,
                          BufferFatPtrToIntTypeMap *IntTypeMap,
                          const DataLayout &DL)
      : TypeMap(TypeMap), IntTypeMap(IntTypeMap),
        InternalMapper(UnderlyingMap, RF_None, TypeMap, this), DL(DL) {}
  virtual ~FatPtrConstMaterializer() = default;

  Value *materialize(Value *V) override;
};
} // namespace

Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
  Type *SrcTy = C->getType();
  auto *NewTy = dyn_cast<StructType>(TypeMap->remapType(SrcTy));
  if (C->isNullValue())
    return ConstantAggregateZero::getNullValue(NewTy);
  if (isa<PoisonValue>(C)) {
    return ConstantStruct::get(NewTy,
                               {PoisonValue::get(NewTy->getElementType(0)),
                                PoisonValue::get(NewTy->getElementType(1))});
  }
  if (isa<UndefValue>(C)) {
    return ConstantStruct::get(NewTy,
                               {UndefValue::get(NewTy->getElementType(0)),
                                UndefValue::get(NewTy->getElementType(1))});
  }

  if (isa<GlobalValue>(C))
    report_fatal_error("Global values containing ptr addrspace(7) (buffer "
                       "fat pointer) values are not supported");

  if (auto *VC = dyn_cast<ConstantVector>(C)) {
    if (Constant *S = VC->getSplatValue()) {
      Constant *NewS = InternalMapper.mapConstant(*S);
      if (!NewS)
        return nullptr;
      auto [Rsrc, Off] = splitLoweredFatBufferConst(NewS);
      auto EC = VC->getType()->getElementCount();
      return ConstantStruct::get(NewTy, {ConstantVector::getSplat(EC, Rsrc),
                                         ConstantVector::getSplat(EC, Off)});
    }
    SmallVector<Constant *> Rsrcs;
    SmallVector<Constant *> Offs;
    for (Value *Op : VC->operand_values()) {
      auto *NewOp = dyn_cast_or_null<Constant>(InternalMapper.mapValue(*Op));
      if (!NewOp)
        return nullptr;
      auto [Rsrc, Off] = splitLoweredFatBufferConst(NewOp);
      Rsrcs.push_back(Rsrc);
      Offs.push_back(Off);
    }
    Constant *RsrcVec = ConstantVector::get(Rsrcs);
    Constant *OffVec = ConstantVector::get(Offs);
    return ConstantStruct::get(NewTy, {RsrcVec, OffVec});
  }

  // Constant expressions. This code mirrors how we fix up the equivalent
  // instructions later.
  auto *CE = dyn_cast<ConstantExpr>(C);
  if (!CE)
    return nullptr;
  if (auto *GEPO = dyn_cast<GEPOperator>(C)) {
    Constant *RemappedPtr =
        InternalMapper.mapConstant(*cast<Constant>(GEPO->getPointerOperand()));
    auto [Rsrc, Off] = splitLoweredFatBufferConst(RemappedPtr);
    Type *OffTy = Off->getType();
    bool InBounds = GEPO->isInBounds();

    MapVector<Value *, APInt> VariableOffs;
    APInt NewConstOffVal = APInt::getZero(BufferOffsetWidth);
    if (!GEPO->collectOffset(DL, BufferOffsetWidth, VariableOffs,
                             NewConstOffVal))
      report_fatal_error(
          "Scalable vector or unsized struct in fat pointer GEP");
    Constant *OffAccum = nullptr;
    // Accumulate offsets together before adding to the base in order to
    // preserve as many of the inbounds properties as possible.
    for (auto [Arg, Multiple] : VariableOffs) {
      Constant *NewArg = InternalMapper.mapConstant(*cast<Constant>(Arg));
      NewArg = ConstantFoldIntegerCast(NewArg, OffTy, /*IsSigned=*/true, DL);
      if (!Multiple.isOne()) {
        if (Multiple.isPowerOf2()) {
          NewArg = ConstantExpr::getShl(
              NewArg,
              CE->getIntegerValue(
                  OffTy, APInt(BufferOffsetWidth, Multiple.logBase2())),
              /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
        } else {
          NewArg =
              ConstantExpr::getMul(NewArg, CE->getIntegerValue(OffTy, Multiple),
                                   /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
        }
      }
      if (OffAccum) {
        OffAccum = ConstantExpr::getAdd(OffAccum, NewArg, /*hasNUW=*/InBounds,
                                        /*hasNSW=*/InBounds);
      } else {
        OffAccum = NewArg;
      }
    }
    Constant *NewConstOff = CE->getIntegerValue(OffTy, NewConstOffVal);
    if (OffAccum)
      OffAccum = ConstantExpr::getAdd(OffAccum, NewConstOff,
                                      /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
    else
      OffAccum = NewConstOff;
    bool HasNonNegativeOff = false;
    if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
      HasNonNegativeOff = !CI->isNegative();
    }
    Constant *NewOff = ConstantExpr::getAdd(
        Off, OffAccum, /*hasNUW=*/InBounds && HasNonNegativeOff,
        /*hasNSW=*/false);
    return ConstantStruct::get(NewTy, {Rsrc, NewOff});
  }

  if (auto *PI = dyn_cast<PtrToIntOperator>(CE)) {
    Constant *Parts =
        InternalMapper.mapConstant(*cast<Constant>(PI->getPointerOperand()));
    auto [Rsrc, Off] = splitLoweredFatBufferConst(Parts);
    // Here, we take advantage of the fact that ptrtoint has a built-in
    // zero-extension behavior.
    unsigned FatPtrWidth =
        DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
    Constant *RsrcInt = CE->getPtrToInt(Rsrc, SrcTy);
    unsigned Width = SrcTy->getScalarSizeInBits();
    Constant *Shift =
        CE->getIntegerValue(SrcTy, APInt(Width, BufferOffsetWidth));
    Constant *OffCast =
        ConstantFoldIntegerCast(Off, SrcTy, /*IsSigned=*/false, DL);
    Constant *RsrcHi = ConstantExpr::getShl(
        RsrcInt, Shift, Width >= FatPtrWidth, Width > FatPtrWidth);
    // This should be an or, but those got recently removed.
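    // (When the result type is wide enough to hold both parts, the shifted
    // resource bits and the zero-extended offset occupy disjoint bit ranges,
    // so the add below cannot carry and behaves like the or would have.)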
    Constant *Result = ConstantExpr::getAdd(RsrcHi, OffCast, true, true);
    return Result;
  }

  if (CE->getOpcode() == Instruction::IntToPtr) {
    auto *Arg = cast<Constant>(CE->getOperand(0));
    unsigned FatPtrWidth =
        DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
    unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE);
    auto *WantedTy = Arg->getType()->getWithNewBitWidth(FatPtrWidth);
    Arg = ConstantFoldIntegerCast(Arg, WantedTy, /*IsSigned=*/false, DL);

    Constant *Shift =
        CE->getIntegerValue(WantedTy, APInt(FatPtrWidth, BufferOffsetWidth));
    Type *RsrcIntType = WantedTy->getWithNewBitWidth(RsrcPtrWidth);
    Type *RsrcTy = NewTy->getElementType(0);
    Type *OffTy = WantedTy->getWithNewBitWidth(BufferOffsetWidth);
    Constant *RsrcInt = CE->getTrunc(
        ConstantFoldBinaryOpOperands(Instruction::LShr, Arg, Shift, DL),
        RsrcIntType);
    Constant *Rsrc = CE->getIntToPtr(RsrcInt, RsrcTy);
    Constant *Off = ConstantFoldIntegerCast(Arg, OffTy, /*IsSigned=*/false, DL);

    return ConstantStruct::get(NewTy, {Rsrc, Off});
  }

  if (auto *AC = dyn_cast<AddrSpaceCastOperator>(CE)) {
    unsigned SrcAS = AC->getSrcAddressSpace();
    unsigned DstAS = AC->getDestAddressSpace();
    auto *Arg = cast<Constant>(AC->getPointerOperand());
    auto *NewArg = InternalMapper.mapConstant(*Arg);
    if (!NewArg)
      return nullptr;
    if (SrcAS == AMDGPUAS::BUFFER_FAT_POINTER &&
        DstAS == AMDGPUAS::BUFFER_FAT_POINTER)
      return NewArg;
    if (SrcAS == AMDGPUAS::BUFFER_RESOURCE &&
        DstAS == AMDGPUAS::BUFFER_FAT_POINTER) {
      auto *NullOff = CE->getNullValue(NewTy->getElementType(1));
      return ConstantStruct::get(NewTy, {NewArg, NullOff});
    }
    report_fatal_error(
        "Unsupported address space cast for a buffer fat pointer");
  }
  return nullptr;
}

Value *FatPtrConstMaterializer::materialize(Value *V) {
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;
  if (auto *GEPO = dyn_cast<GEPOperator>(C)) {
    // As a special case, adjust GEP constants that have a ptr addrspace(7) in
    // their source types here, since the earlier local changes didn't handle
    // this.
    Type *SrcTy = GEPO->getSourceElementType();
    Type *NewSrcTy = IntTypeMap->remapType(SrcTy);
    if (SrcTy != NewSrcTy) {
      SmallVector<Constant *> Ops;
      Ops.reserve(GEPO->getNumOperands());
      for (const Use &U : GEPO->operands())
        Ops.push_back(cast<Constant>(U.get()));
      auto *NewGEP = ConstantExpr::getGetElementPtr(
          NewSrcTy, Ops[0], ArrayRef<Constant *>(Ops).slice(1),
          GEPO->getNoWrapFlags(), GEPO->getInRange());
      LLVM_DEBUG(dbgs() << "p7-getting GEP: " << *GEPO << " becomes " << *NewGEP
                        << "\n");
      Value *FurtherMap = materialize(NewGEP);
      return FurtherMap ? FurtherMap : NewGEP;
    }
  }
  // Structs and other types that happen to contain fat pointers get remapped
  // by the mapValue() logic.
  if (!isBufferFatPtrConst(C))
    return nullptr;
  return materializeBufferFatPtrConst(C);
}

using PtrParts = std::pair<Value *, Value *>;
namespace {
// The visitor returns the resource and offset parts for an instruction if
// they can be computed, or (nullptr, nullptr) for cases that don't have a
// meaningful value mapping.
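//
// For example (a rough sketch), a `getelementptr i8, ptr addrspace(7) %p,
// i32 4` keeps %p's resource part unchanged and becomes roughly an
// `add i32 %p.off, 4` on the offset part.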
class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
  ValueToValueMapTy RsrcParts;
  ValueToValueMapTy OffParts;

  // Track instructions that have been rewritten into a user of the component
  // parts of their ptr addrspace(7) input. Instructions that produced
  // ptr addrspace(7) parts should **not** be RAUW'd before being added to this
  // set, as that replacement will be handled in a post-visit step. However,
  // instructions that yield values that aren't fat pointers (ex. ptrtoint)
  // should RAUW themselves with new instructions that use the split parts
  // of their arguments during processing.
  DenseSet<Instruction *> SplitUsers;

  // Nodes that need a second look once we've computed the parts for all other
  // instructions to see if, for example, we really need to phi on the resource
  // part.
  SmallVector<Instruction *> Conditionals;
  // Temporary instructions produced while lowering conditionals that should be
  // killed.
  SmallVector<Instruction *> ConditionalTemps;

  // Subtarget info, needed for determining what cache control bits to set.
  const TargetMachine *TM;
  const GCNSubtarget *ST;

  IRBuilder<> IRB;

  // Copy metadata between instructions if applicable.
  void copyMetadata(Value *Dest, Value *Src);

  // Get the resource and offset parts of the value V, inserting appropriate
  // extractvalue calls if needed.
  PtrParts getPtrParts(Value *V);

  // Given an instruction that could produce multiple resource parts (a PHI or
  // select), collect the set of values that could have provided its resource
  // part (the `Roots`) and the set of conditional instructions visited during
  // the search (`Seen`). If, after removing the root of the search from `Seen`
  // and `Roots`, `Seen` is a subset of `Roots` and `Roots - Seen` contains one
  // element, the resource part of that element can replace the resource part
  // of all other elements in `Seen`.
  void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl<Value *> &Roots,
                            SmallPtrSetImpl<Value *> &Seen);
  void processConditionals();

  // If an instruction has been split into resource and offset parts,
  // delete that instruction. If any of its uses have not themselves been split
  // into parts (for example, an insertvalue), construct the struct that the
  // type rewriting says the dying instruction should produce and use that.
  // Also, kill the temporary extractvalue operations produced by the two-stage
  // lowering of PHIs and conditionals.
  void killAndReplaceSplitInstructions(SmallVectorImpl<Instruction *> &Origs);

  void setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx);
  void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
  void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
  Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty,
                          Align Alignment, AtomicOrdering Order,
                          bool IsVolatile, SyncScope::ID SSID);

public:
  SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM)
      : TM(TM), ST(nullptr), IRB(Ctx) {}

  void processFunction(Function &F);

  PtrParts visitInstruction(Instruction &I);
  PtrParts visitLoadInst(LoadInst &LI);
  PtrParts visitStoreInst(StoreInst &SI);
  PtrParts visitAtomicRMWInst(AtomicRMWInst &AI);
  PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI);
  PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP);

  PtrParts visitPtrToIntInst(PtrToIntInst &PI);
  PtrParts visitIntToPtrInst(IntToPtrInst &IP);
  PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I);
  PtrParts visitICmpInst(ICmpInst &Cmp);
  PtrParts visitFreezeInst(FreezeInst &I);

  PtrParts visitExtractElementInst(ExtractElementInst &I);
  PtrParts visitInsertElementInst(InsertElementInst &I);
  PtrParts visitShuffleVectorInst(ShuffleVectorInst &I);

  PtrParts visitPHINode(PHINode &PHI);
  PtrParts visitSelectInst(SelectInst &SI);

  PtrParts visitIntrinsicInst(IntrinsicInst &II);
};
} // namespace

void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) {
  auto *DestI = dyn_cast<Instruction>(Dest);
  auto *SrcI = dyn_cast<Instruction>(Src);

  if (!DestI || !SrcI)
    return;

  DestI->copyMetadata(*SrcI);
}

PtrParts SplitPtrStructs::getPtrParts(Value *V) {
  assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts "
                                        "of something that wasn't rewritten");
  auto *RsrcEntry = &RsrcParts[V];
  auto *OffEntry = &OffParts[V];
  if (*RsrcEntry && *OffEntry)
    return {*RsrcEntry, *OffEntry};

  if (auto *C = dyn_cast<Constant>(V)) {
    auto [Rsrc, Off] = splitLoweredFatBufferConst(C);
    return {*RsrcEntry = Rsrc, *OffEntry = Off};
  }

  IRBuilder<>::InsertPointGuard Guard(IRB);
  if (auto *I = dyn_cast<Instruction>(V)) {
    LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
    auto [Rsrc, Off] = visit(*I);
    if (Rsrc && Off)
      return {*RsrcEntry = Rsrc, *OffEntry = Off};
    // We'll be creating the new values after the relevant instruction.
    // This instruction generates a value and so isn't a terminator.
    IRB.SetInsertPoint(*I->getInsertionPointAfterDef());
    IRB.SetCurrentDebugLocation(I->getDebugLoc());
  } else if (auto *A = dyn_cast<Argument>(V)) {
    IRB.SetInsertPointPastAllocas(A->getParent());
    IRB.SetCurrentDebugLocation(DebugLoc());
  }
  Value *Rsrc = IRB.CreateExtractValue(V, 0, V->getName() + ".rsrc");
  Value *Off = IRB.CreateExtractValue(V, 1, V->getName() + ".off");
  return {*RsrcEntry = Rsrc, *OffEntry = Off};
}

/// Returns the instruction that defines the resource part of the value V.
/// Note that this is not getUnderlyingObject(), since that looks through
/// operations like ptrmask which might modify the resource part.
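/// For instance (illustrative), given `%v = getelementptr i8,
/// ptr addrspace(7) %w, i32 8`, the root is `%w` (or whatever `%w` itself
/// GEPs or addrspacecasts from).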
///
/// We can limit ourselves to just looking through GEPs followed by looking
/// through addrspacecasts because only those two operations preserve the
/// resource part, and because operations on an `addrspace(8)` (which is the
/// legal input to this addrspacecast) would produce a different resource part.
static Value *rsrcPartRoot(Value *V) {
  while (auto *GEP = dyn_cast<GEPOperator>(V))
    V = GEP->getPointerOperand();
  while (auto *ASC = dyn_cast<AddrSpaceCastOperator>(V))
    V = ASC->getPointerOperand();
  return V;
}

void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
                                           SmallPtrSetImpl<Value *> &Roots,
                                           SmallPtrSetImpl<Value *> &Seen) {
  if (auto *PHI = dyn_cast<PHINode>(I)) {
    if (!Seen.insert(I).second)
      return;
    for (Value *In : PHI->incoming_values()) {
      In = rsrcPartRoot(In);
      Roots.insert(In);
      if (isa<PHINode, SelectInst>(In))
        getPossibleRsrcRoots(cast<Instruction>(In), Roots, Seen);
    }
  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
    if (!Seen.insert(SI).second)
      return;
    Value *TrueVal = rsrcPartRoot(SI->getTrueValue());
    Value *FalseVal = rsrcPartRoot(SI->getFalseValue());
    Roots.insert(TrueVal);
    Roots.insert(FalseVal);
    if (isa<PHINode, SelectInst>(TrueVal))
      getPossibleRsrcRoots(cast<Instruction>(TrueVal), Roots, Seen);
    if (isa<PHINode, SelectInst>(FalseVal))
      getPossibleRsrcRoots(cast<Instruction>(FalseVal), Roots, Seen);
  } else {
    llvm_unreachable("getPossibleRsrcParts() only works on phi and select");
  }
}

void SplitPtrStructs::processConditionals() {
  SmallDenseMap<Instruction *, Value *> FoundRsrcs;
  SmallPtrSet<Value *, 4> Roots;
  SmallPtrSet<Value *, 4> Seen;
  for (Instruction *I : Conditionals) {
    // These have to exist by now because we've visited these nodes.
    Value *Rsrc = RsrcParts[I];
    Value *Off = OffParts[I];
    assert(Rsrc && Off && "must have visited conditionals by now");

    std::optional<Value *> MaybeRsrc;
    auto MaybeFoundRsrc = FoundRsrcs.find(I);
    if (MaybeFoundRsrc != FoundRsrcs.end()) {
      MaybeRsrc = MaybeFoundRsrc->second;
    } else {
      IRBuilder<>::InsertPointGuard Guard(IRB);
      Roots.clear();
      Seen.clear();
      getPossibleRsrcRoots(I, Roots, Seen);
      LLVM_DEBUG(dbgs() << "Processing conditional: " << *I << "\n");
#ifndef NDEBUG
      for (Value *V : Roots)
        LLVM_DEBUG(dbgs() << "Root: " << *V << "\n");
      for (Value *V : Seen)
        LLVM_DEBUG(dbgs() << "Seen: " << *V << "\n");
#endif
      // If we are our own possible root, then we shouldn't block our
      // replacement with a valid incoming value.
      Roots.erase(I);
      // We don't want to block the optimization for conditionals that don't
      // refer to themselves but did see themselves during the traversal.
      Seen.erase(I);

      if (set_is_subset(Seen, Roots)) {
        auto Diff = set_difference(Roots, Seen);
        if (Diff.size() == 1) {
          Value *RootVal = *Diff.begin();
          // Handle the case where previous loops already looked through
          // an addrspacecast.
          if (isSplitFatPtr(RootVal->getType()))
            MaybeRsrc = std::get<0>(getPtrParts(RootVal));
          else
            MaybeRsrc = RootVal;
        }
      }
    }

    if (auto *PHI = dyn_cast<PHINode>(I)) {
      Value *NewRsrc;
      StructType *PHITy = cast<StructType>(PHI->getType());
      IRB.SetInsertPoint(*PHI->getInsertionPointAfterDef());
      IRB.SetCurrentDebugLocation(PHI->getDebugLoc());
      if (MaybeRsrc) {
        NewRsrc = *MaybeRsrc;
      } else {
        Type *RsrcTy = PHITy->getElementType(0);
        auto *RsrcPHI = IRB.CreatePHI(RsrcTy, PHI->getNumIncomingValues());
        RsrcPHI->takeName(Rsrc);
        for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) {
          Value *VRsrc = std::get<0>(getPtrParts(V));
          RsrcPHI->addIncoming(VRsrc, BB);
        }
        copyMetadata(RsrcPHI, PHI);
        NewRsrc = RsrcPHI;
      }

      Type *OffTy = PHITy->getElementType(1);
      auto *NewOff = IRB.CreatePHI(OffTy, PHI->getNumIncomingValues());
      NewOff->takeName(Off);
      for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) {
        assert(OffParts.count(V) && "An offset part had to be created by now");
        Value *VOff = std::get<1>(getPtrParts(V));
        NewOff->addIncoming(VOff, BB);
      }
      copyMetadata(NewOff, PHI);

      // Note: We don't eraseFromParent() the temporaries because we don't
      // want to put the corrections maps in an inconsistent state. That'll be
      // handled during the rest of the killing. Also, `ValueToValueMapTy`
      // guarantees that references in that map will be updated as well.
      ConditionalTemps.push_back(cast<Instruction>(Rsrc));
      ConditionalTemps.push_back(cast<Instruction>(Off));
      Rsrc->replaceAllUsesWith(NewRsrc);
      Off->replaceAllUsesWith(NewOff);

      // Save on recomputing the cycle traversals in known-root cases.
      if (MaybeRsrc)
        for (Value *V : Seen)
          FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
    } else if (isa<SelectInst>(I)) {
      if (MaybeRsrc) {
        ConditionalTemps.push_back(cast<Instruction>(Rsrc));
        Rsrc->replaceAllUsesWith(*MaybeRsrc);
        for (Value *V : Seen)
          FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc;
      }
    } else {
      llvm_unreachable("Only PHIs and selects go in the conditionals list");
    }
  }
}

void SplitPtrStructs::killAndReplaceSplitInstructions(
    SmallVectorImpl<Instruction *> &Origs) {
  for (Instruction *I : ConditionalTemps)
    I->eraseFromParent();

  for (Instruction *I : Origs) {
    if (!SplitUsers.contains(I))
      continue;

    SmallVector<DbgValueInst *> Dbgs;
    findDbgValues(Dbgs, I);
    for (auto *Dbg : Dbgs) {
      IRB.SetInsertPoint(Dbg);
      auto &DL = I->getModule()->getDataLayout();
      assert(isSplitFatPtr(I->getType()) &&
             "We should've RAUW'd away loads, stores, etc. "
at this point"); 1119 auto *OffDbg = cast<DbgValueInst>(Dbg->clone()); 1120 copyMetadata(OffDbg, Dbg); 1121 auto [Rsrc, Off] = getPtrParts(I); 1122 1123 int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType()); 1124 int64_t OffSz = DL.getTypeSizeInBits(Off->getType()); 1125 1126 std::optional<DIExpression *> RsrcExpr = 1127 DIExpression::createFragmentExpression(Dbg->getExpression(), 0, 1128 RsrcSz); 1129 std::optional<DIExpression *> OffExpr = 1130 DIExpression::createFragmentExpression(Dbg->getExpression(), RsrcSz, 1131 OffSz); 1132 if (OffExpr) { 1133 OffDbg->setExpression(*OffExpr); 1134 OffDbg->replaceVariableLocationOp(I, Off); 1135 IRB.Insert(OffDbg); 1136 } else { 1137 OffDbg->deleteValue(); 1138 } 1139 if (RsrcExpr) { 1140 Dbg->setExpression(*RsrcExpr); 1141 Dbg->replaceVariableLocationOp(I, Rsrc); 1142 } else { 1143 Dbg->replaceVariableLocationOp(I, UndefValue::get(I->getType())); 1144 } 1145 } 1146 1147 Value *Poison = PoisonValue::get(I->getType()); 1148 I->replaceUsesWithIf(Poison, [&](const Use &U) -> bool { 1149 if (const auto *UI = dyn_cast<Instruction>(U.getUser())) 1150 return SplitUsers.contains(UI); 1151 return false; 1152 }); 1153 1154 if (I->use_empty()) { 1155 I->eraseFromParent(); 1156 continue; 1157 } 1158 IRB.SetInsertPoint(*I->getInsertionPointAfterDef()); 1159 IRB.SetCurrentDebugLocation(I->getDebugLoc()); 1160 auto [Rsrc, Off] = getPtrParts(I); 1161 Value *Struct = PoisonValue::get(I->getType()); 1162 Struct = IRB.CreateInsertValue(Struct, Rsrc, 0); 1163 Struct = IRB.CreateInsertValue(Struct, Off, 1); 1164 copyMetadata(Struct, I); 1165 Struct->takeName(I); 1166 I->replaceAllUsesWith(Struct); 1167 I->eraseFromParent(); 1168 } 1169 } 1170 1171 void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) { 1172 LLVMContext &Ctx = Intr->getContext(); 1173 Intr->addParamAttr(RsrcArgIdx, Attribute::getWithAlignment(Ctx, A)); 1174 } 1175 1176 void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order, 1177 SyncScope::ID SSID) { 1178 switch (Order) { 1179 case AtomicOrdering::Release: 1180 case AtomicOrdering::AcquireRelease: 1181 case AtomicOrdering::SequentiallyConsistent: 1182 IRB.CreateFence(AtomicOrdering::Release, SSID); 1183 break; 1184 default: 1185 break; 1186 } 1187 } 1188 1189 void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order, 1190 SyncScope::ID SSID) { 1191 switch (Order) { 1192 case AtomicOrdering::Acquire: 1193 case AtomicOrdering::AcquireRelease: 1194 case AtomicOrdering::SequentiallyConsistent: 1195 IRB.CreateFence(AtomicOrdering::Acquire, SSID); 1196 break; 1197 default: 1198 break; 1199 } 1200 } 1201 1202 Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, 1203 Type *Ty, Align Alignment, 1204 AtomicOrdering Order, bool IsVolatile, 1205 SyncScope::ID SSID) { 1206 IRB.SetInsertPoint(I); 1207 1208 auto [Rsrc, Off] = getPtrParts(Ptr); 1209 SmallVector<Value *, 5> Args; 1210 if (Arg) 1211 Args.push_back(Arg); 1212 Args.push_back(Rsrc); 1213 Args.push_back(Off); 1214 insertPreMemOpFence(Order, SSID); 1215 // soffset is always 0 for these cases, where we always want any offset to be 1216 // part of bounds checking and we don't know which parts of the GEPs is 1217 // uniform. 1218 Args.push_back(IRB.getInt32(0)); 1219 1220 uint32_t Aux = 0; 1221 bool IsInvariant = 1222 (isa<LoadInst>(I) && I->getMetadata(LLVMContext::MD_invariant_load)); 1223 bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal); 1224 // Atomic loads and stores need glc, atomic read-modify-write doesn't. 
  bool IsOneWayAtomic =
      !isa<AtomicRMWInst>(I) && Order != AtomicOrdering::NotAtomic;
  if (IsOneWayAtomic)
    Aux |= AMDGPU::CPol::GLC;
  if (IsNonTemporal && !IsInvariant)
    Aux |= AMDGPU::CPol::SLC;
  if (isa<LoadInst>(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10)
    Aux |= (Aux & AMDGPU::CPol::GLC ? AMDGPU::CPol::DLC : 0);
  if (IsVolatile)
    Aux |= AMDGPU::CPol::VOLATILE;
  Args.push_back(IRB.getInt32(Aux));

  Intrinsic::ID IID = Intrinsic::not_intrinsic;
  if (isa<LoadInst>(I))
    // TODO: Do we need to do something about atomic loads?
    IID = Intrinsic::amdgcn_raw_ptr_buffer_load;
  else if (isa<StoreInst>(I))
    IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
  else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
    switch (RMW->getOperation()) {
    case AtomicRMWInst::Xchg:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap;
      break;
    case AtomicRMWInst::Add:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add;
      break;
    case AtomicRMWInst::Sub:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub;
      break;
    case AtomicRMWInst::And:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and;
      break;
    case AtomicRMWInst::Or:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or;
      break;
    case AtomicRMWInst::Xor:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor;
      break;
    case AtomicRMWInst::Max:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax;
      break;
    case AtomicRMWInst::Min:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin;
      break;
    case AtomicRMWInst::UMax:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax;
      break;
    case AtomicRMWInst::UMin:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin;
      break;
    case AtomicRMWInst::FAdd:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd;
      break;
    case AtomicRMWInst::FMax:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax;
      break;
    case AtomicRMWInst::FMin:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
      break;
    case AtomicRMWInst::FSub: {
      report_fatal_error("atomic floating point subtraction not supported for "
                         "buffer resources and should've been expanded away");
      break;
    }
    case AtomicRMWInst::Nand:
      report_fatal_error("atomic nand not supported for buffer resources and "
                         "should've been expanded away");
      break;
    case AtomicRMWInst::UIncWrap:
    case AtomicRMWInst::UDecWrap:
      report_fatal_error("wrapping increment/decrement not supported for "
                         "buffer resources and should've been expanded away");
      break;
    case AtomicRMWInst::BAD_BINOP:
      llvm_unreachable("Not sure how we got a bad binop");
    }
  }

  auto *Call = IRB.CreateIntrinsic(IID, Ty, Args);
  copyMetadata(Call, I);
  setAlign(Call, Alignment, Arg ? 1 : 0);
  Call->takeName(I);

  insertPostMemOpFence(Order, SSID);
  // The "no moving p7 directly" rewrites ensure that this load or store won't
  // itself need to be split into parts.
  SplitUsers.insert(I);
  I->replaceAllUsesWith(Call);
  return Call;
}

PtrParts SplitPtrStructs::visitInstruction(Instruction &I) {
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) {
  if (!isSplitFatPtr(LI.getPointerOperandType()))
    return {nullptr, nullptr};
  handleMemoryInst(&LI, nullptr, LI.getPointerOperand(), LI.getType(),
                   LI.getAlign(), LI.getOrdering(), LI.isVolatile(),
                   LI.getSyncScopeID());
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) {
  if (!isSplitFatPtr(SI.getPointerOperandType()))
    return {nullptr, nullptr};
  Value *Arg = SI.getValueOperand();
  handleMemoryInst(&SI, Arg, SI.getPointerOperand(), Arg->getType(),
                   SI.getAlign(), SI.getOrdering(), SI.isVolatile(),
                   SI.getSyncScopeID());
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) {
  if (!isSplitFatPtr(AI.getPointerOperand()->getType()))
    return {nullptr, nullptr};
  Value *Arg = AI.getValOperand();
  handleMemoryInst(&AI, Arg, AI.getPointerOperand(), Arg->getType(),
                   AI.getAlign(), AI.getOrdering(), AI.isVolatile(),
                   AI.getSyncScopeID());
  return {nullptr, nullptr};
}

// Unlike load, store, and RMW, cmpxchg needs special handling to account
// for the boolean argument.
PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) {
  Value *Ptr = AI.getPointerOperand();
  if (!isSplitFatPtr(Ptr->getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&AI);

  Type *Ty = AI.getNewValOperand()->getType();
  AtomicOrdering Order = AI.getMergedOrdering();
  SyncScope::ID SSID = AI.getSyncScopeID();
  bool IsNonTemporal = AI.getMetadata(LLVMContext::MD_nontemporal);

  auto [Rsrc, Off] = getPtrParts(Ptr);
  insertPreMemOpFence(Order, SSID);

  uint32_t Aux = 0;
  if (IsNonTemporal)
    Aux |= AMDGPU::CPol::SLC;
  if (AI.isVolatile())
    Aux |= AMDGPU::CPol::VOLATILE;
  auto *Call =
      IRB.CreateIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Ty,
                          {AI.getNewValOperand(), AI.getCompareOperand(), Rsrc,
                           Off, IRB.getInt32(0), IRB.getInt32(Aux)});
  copyMetadata(Call, &AI);
  setAlign(Call, AI.getAlign(), 2);
  Call->takeName(&AI);
  insertPostMemOpFence(Order, SSID);

  Value *Res = PoisonValue::get(AI.getType());
  Res = IRB.CreateInsertValue(Res, Call, 0);
  if (!AI.isWeak()) {
    Value *Succeeded = IRB.CreateICmpEQ(Call, AI.getCompareOperand());
    Res = IRB.CreateInsertValue(Res, Succeeded, 1);
  }
  SplitUsers.insert(&AI);
  AI.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
  Value *Ptr = GEP.getPointerOperand();
  if (!isSplitFatPtr(Ptr->getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&GEP);

  auto [Rsrc, Off] = getPtrParts(Ptr);
  const DataLayout &DL = GEP.getModule()->getDataLayout();
  bool InBounds = GEP.isInBounds();

  // In order to call emitGEPOffset() and thus not have to reimplement it,
  // we need the GEP result to have ptr addrspace(7) type.
  Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER);
  if (auto *VT = dyn_cast<VectorType>(Off->getType()))
    FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount());
  GEP.mutateType(FatPtrTy);
  Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP);
  GEP.mutateType(Ptr->getType());
  if (!OffAccum) { // Constant-zero offset
    SplitUsers.insert(&GEP);
    return {Rsrc, Off};
  }

  bool HasNonNegativeOff = false;
  if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
    HasNonNegativeOff = !CI->isNegative();
  }
  Value *NewOff;
  if (PatternMatch::match(Off, PatternMatch::is_zero())) {
    NewOff = OffAccum;
  } else {
    NewOff = IRB.CreateAdd(Off, OffAccum, "",
                           /*hasNUW=*/InBounds && HasNonNegativeOff,
                           /*hasNSW=*/false);
  }
  copyMetadata(NewOff, &GEP);
  NewOff->takeName(&GEP);
  SplitUsers.insert(&GEP);
  return {Rsrc, NewOff};
}

PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
  Value *Ptr = PI.getPointerOperand();
  if (!isSplitFatPtr(Ptr->getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&PI);

  Type *ResTy = PI.getType();
  unsigned Width = ResTy->getScalarSizeInBits();

  auto [Rsrc, Off] = getPtrParts(Ptr);
  const DataLayout &DL = PI.getModule()->getDataLayout();
  unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);

  Value *RsrcInt;
  if (Width <= BufferOffsetWidth)
    RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width));
  else
    RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
  copyMetadata(RsrcInt, &PI);

  Value *Shl = IRB.CreateShl(
      RsrcInt,
      ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "",
      Width >= FatPtrWidth, Width > FatPtrWidth);
  Value *OffCast =
      IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off");
  Value *Res = IRB.CreateOr(Shl, OffCast);
  Res->takeName(&PI);
  SplitUsers.insert(&PI);
  PI.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) {
  if (!isSplitFatPtr(IP.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&IP);
  const DataLayout &DL = IP.getModule()->getDataLayout();
  unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE);
  Value *Int = IP.getOperand(0);
  Type *IntTy = Int->getType();
  Type *RsrcIntTy = IntTy->getWithNewBitWidth(RsrcPtrWidth);
  unsigned Width = IntTy->getScalarSizeInBits();

  auto *RetTy = cast<StructType>(IP.getType());
  Type *RsrcTy = RetTy->getElementType(0);
  Type *OffTy = RetTy->getElementType(1);
  Value *RsrcPart = IRB.CreateLShr(
      Int,
      ConstantExpr::getIntegerValue(IntTy, APInt(Width, BufferOffsetWidth)));
  Value *RsrcInt = IRB.CreateIntCast(RsrcPart, RsrcIntTy, /*isSigned=*/false);
  Value *Rsrc = IRB.CreateIntToPtr(RsrcInt, RsrcTy, IP.getName() + ".rsrc");
  Value *Off =
      IRB.CreateIntCast(Int, OffTy, /*isSigned=*/false, IP.getName() + ".off");

  copyMetadata(Rsrc, &IP);
  SplitUsers.insert(&IP);
  return {Rsrc, Off};
}

PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *In = I.getPointerOperand();
  // No-op casts preserve parts

PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *In = I.getPointerOperand();
  // No-op casts preserve parts
  if (In->getType() == I.getType()) {
    auto [Rsrc, Off] = getPtrParts(In);
    SplitUsers.insert(&I);
    return {Rsrc, Off};
  }
  if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE)
    report_fatal_error("Only buffer resources (addrspace 8) can be cast to "
                       "buffer fat pointers (addrspace 7)");
  Type *OffTy = cast<StructType>(I.getType())->getElementType(1);
  Value *ZeroOff = Constant::getNullValue(OffTy);
  SplitUsers.insert(&I);
  return {In, ZeroOff};
}

PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) {
  Value *Lhs = Cmp.getOperand(0);
  if (!isSplitFatPtr(Lhs->getType()))
    return {nullptr, nullptr};
  Value *Rhs = Cmp.getOperand(1);
  IRB.SetInsertPoint(&Cmp);
  ICmpInst::Predicate Pred = Cmp.getPredicate();

  assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
         "Pointer comparison is only equal or unequal");
  auto [LhsRsrc, LhsOff] = getPtrParts(Lhs);
  auto [RhsRsrc, RhsOff] = getPtrParts(Rhs);
  Value *RsrcCmp =
      IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc");
  copyMetadata(RsrcCmp, &Cmp);
  Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off");
  copyMetadata(OffCmp, &Cmp);

  Value *Res = nullptr;
  if (Pred == ICmpInst::ICMP_EQ)
    Res = IRB.CreateAnd(RsrcCmp, OffCmp);
  else if (Pred == ICmpInst::ICMP_NE)
    Res = IRB.CreateOr(RsrcCmp, OffCmp);
  copyMetadata(Res, &Cmp);
  Res->takeName(&Cmp);
  SplitUsers.insert(&Cmp);
  Cmp.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  auto [Rsrc, Off] = getPtrParts(I.getOperand(0));

  Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getVectorOperand();
  Value *Idx = I.getIndexOperand();
  auto [Rsrc, Off] = getPtrParts(Vec);

  Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}
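// As an illustrative example (made-up IR) of the element-wise handling above
// and below:
//   %e = extractelement <2 x ptr addrspace(7)> %v, i32 1
// becomes one extract from each of the two parallel vectors,
//   %e.rsrc = extractelement <2 x ptr addrspace(8)> %v.rsrc, i32 1
//   %e.off  = extractelement <2 x i32> %v.off, i32 1
// and insertelement/shufflevector follow the same pattern.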

PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) {
  // The mutated instructions temporarily don't return vectors, and so
  // we need the generic getType() here to avoid crashes.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getOperand(0);
  Value *Elem = I.getOperand(1);
  Value *Idx = I.getOperand(2);
  auto [VecRsrc, VecOff] = getPtrParts(Vec);
  auto [ElemRsrc, ElemOff] = getPtrParts(Elem);

  Value *RsrcRes =
      IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) {
  // Cast is needed for the same reason as insertelement's.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);

  Value *V1 = I.getOperand(0);
  Value *V2 = I.getOperand(1);
  ArrayRef<int> Mask = I.getShuffleMask();
  auto [V1Rsrc, V1Off] = getPtrParts(V1);
  auto [V2Rsrc, V2Off] = getPtrParts(V2);

  Value *RsrcRes =
      IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) {
  if (!isSplitFatPtr(PHI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(*PHI.getInsertionPointAfterDef());
  // Phi nodes will be handled in post-processing after we've visited every
  // instruction. However, instead of just returning {nullptr, nullptr},
  // we explicitly create the temporary extractvalue operations that stand in
  // for our results so that they end up at the beginning of the block with
  // the PHIs.
  Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc");
  Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off");
  Conditionals.push_back(&PHI);
  SplitUsers.insert(&PHI);
  return {TmpRsrc, TmpOff};
}

PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) {
  if (!isSplitFatPtr(SI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&SI);

  Value *Cond = SI.getCondition();
  Value *True = SI.getTrueValue();
  Value *False = SI.getFalseValue();
  auto [TrueRsrc, TrueOff] = getPtrParts(True);
  auto [FalseRsrc, FalseOff] = getPtrParts(False);

  Value *RsrcRes =
      IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI);
  copyMetadata(RsrcRes, &SI);
  Conditionals.push_back(&SI);
  Value *OffRes =
      IRB.CreateSelect(Cond, TrueOff, FalseOff, SI.getName() + ".off", &SI);
  copyMetadata(OffRes, &SI);
  SplitUsers.insert(&SI);
  return {RsrcRes, OffRes};
}
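// Note on the two conditional cases above: both visitPHINode and
// visitSelectInst record the original instruction in `Conditionals`; the real
// rewrite happens later, in processConditionals() (called from
// processFunction() below), once every instruction in the function has been
// visited and the parts of all incoming values are available.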

/// Returns true if this intrinsic needs to be removed when it is
/// applied to `ptr addrspace(7)` values. Calls to these intrinsics are
/// rewritten into calls to versions of that intrinsic on the resource
/// descriptor.
static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  default:
    return false;
  case Intrinsic::ptrmask:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
    return true;
  }
}

PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
  Intrinsic::ID IID = I.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ptrmask: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    Value *Mask = I.getArgOperand(1);
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    if (Mask->getType() != Off->getType())
      report_fatal_error("offset width is not equal to index width of fat "
                         "pointer (data layout not set up correctly?)");
    Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off");
    copyMetadata(OffRes, &I);
    SplitUsers.insert(&I);
    return {Rsrc, OffRes};
  }
  // Pointer annotation intrinsics that, given their object-wide nature,
  // operate on the resource part.
  case Intrinsic::invariant_start: {
    Value *Ptr = I.getArgOperand(1);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE);
    auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::invariant_end: {
    Value *RealPtr = I.getArgOperand(2);
    if (!isSplitFatPtr(RealPtr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    Value *RealRsrc = getPtrParts(RealPtr).first;
    Value *InvPtr = I.getArgOperand(0);
    Value *Size = I.getArgOperand(1);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()},
                                         {InvPtr, Size, RealRsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    return {NewRsrc, Off};
  }
  }
  return {nullptr, nullptr};
}
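// As an illustrative example of the ptrmask case above (value names are made
// up, and the exact intrinsic mangling may differ):
//   %m = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %p, i32 %mask)
// only masks the offset part, becoming roughly
//   %m.off = and i32 %p.off, %mask
// while the resource part of %p is reused unchanged, so the masked pointer
// still refers to the same buffer resource.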

void SplitPtrStructs::processFunction(Function &F) {
  ST = &TM->getSubtarget<GCNSubtarget>(F);
  SmallVector<Instruction *, 0> Originals;
  LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName()
                    << "\n");
  for (Instruction &I : instructions(F))
    Originals.push_back(&I);
  for (Instruction *I : Originals) {
    auto [Rsrc, Off] = visit(I);
    assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
           "Can't have a resource but no offset");
    if (Rsrc)
      RsrcParts[I] = Rsrc;
    if (Off)
      OffParts[I] = Off;
  }
  processConditionals();
  killAndReplaceSplitInstructions(Originals);

  // Clean up after ourselves to save on memory.
  RsrcParts.clear();
  OffParts.clear();
  SplitUsers.clear();
  Conditionals.clear();
  ConditionalTemps.clear();
}

namespace {
class AMDGPULowerBufferFatPointers : public ModulePass {
public:
  static char ID;

  AMDGPULowerBufferFatPointers() : ModulePass(ID) {
    initializeAMDGPULowerBufferFatPointersPass(
        *PassRegistry::getPassRegistry());
  }

  bool run(Module &M, const TargetMachine &TM);
  bool runOnModule(Module &M) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // namespace

/// Returns true if there are values that have a buffer fat pointer in them,
/// which means we'll need to perform rewrites on this function. As a side
/// effect, this will populate the type remapping cache.
static bool containsBufferFatPointers(const Function &F,
                                      BufferFatPtrToStructTypeMap *TypeMap) {
  bool HasFatPointers = false;
  for (const BasicBlock &BB : F) {
    for (const Instruction &I : BB) {
      HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
      for (const Use &U : I.operands())
        if (auto *C = dyn_cast<Constant>(U.get()))
          HasFatPointers |= isBufferFatPtrConst(C);
    }
  }
  return HasFatPointers;
}

static bool hasFatPointerInterface(const Function &F,
                                   BufferFatPtrToStructTypeMap *TypeMap) {
  Type *Ty = F.getFunctionType();
  return Ty != TypeMap->remapType(Ty);
}
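// Illustrative example (hypothetical function): for
//   declare ptr addrspace(7) @f(ptr addrspace(7))
// remapType() changes the function type, so hasFatPointerInterface() returns
// true and run() below sends @f through moveFunctionAdaptingType(); a function
// that only manipulates addrspace(7) values in its body keeps its type and is
// remapped in place instead.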

/// Move the body of `OldF` into a new function, returning it.
static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
                                          ValueToValueMapTy &CloneMap) {
  bool IsIntrinsic = OldF->isIntrinsic();
  Function *NewF =
      Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
  NewF->copyAttributesFrom(OldF);
  NewF->copyMetadata(OldF, 0);
  NewF->takeName(OldF);
  NewF->updateAfterNameChange();
  NewF->setDLLStorageClass(OldF->getDLLStorageClass());
  OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF);

  while (!OldF->empty()) {
    BasicBlock *BB = &OldF->front();
    BB->removeFromParent();
    BB->insertInto(NewF);
    CloneMap[BB] = BB;
    for (Instruction &I : *BB) {
      CloneMap[&I] = &I;
    }
  }

  AttributeMask PtrOnlyAttrs;
  for (auto K :
       {Attribute::Dereferenceable, Attribute::DereferenceableOrNull,
        Attribute::NoAlias, Attribute::NoCapture, Attribute::NoFree,
        Attribute::NonNull, Attribute::NullPointerIsValid, Attribute::ReadNone,
        Attribute::ReadOnly, Attribute::WriteOnly}) {
    PtrOnlyAttrs.addAttribute(K);
  }
  SmallVector<AttributeSet> ArgAttrs;
  AttributeList OldAttrs = OldF->getAttributes();

  for (auto [I, OldArg, NewArg] : enumerate(OldF->args(), NewF->args())) {
    CloneMap[&NewArg] = &OldArg;
    NewArg.takeName(&OldArg);
    Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType();
    // Temporarily mutate type of `NewArg` to allow RAUW to work.
    NewArg.mutateType(OldArgTy);
    OldArg.replaceAllUsesWith(&NewArg);
    NewArg.mutateType(NewArgTy);

    AttributeSet ArgAttr = OldAttrs.getParamAttrs(I);
    // Intrinsics get their attributes fixed later.
    if (OldArgTy != NewArgTy && !IsIntrinsic)
      ArgAttr = ArgAttr.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
    ArgAttrs.push_back(ArgAttr);
  }
  AttributeSet RetAttrs = OldAttrs.getRetAttrs();
  if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic)
    RetAttrs = RetAttrs.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
  NewF->setAttributes(AttributeList::get(
      NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs));
  return NewF;
}

static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) {
  for (Argument &A : F->args())
    CloneMap[&A] = &A;
  for (BasicBlock &BB : *F) {
    CloneMap[&BB] = &BB;
    for (Instruction &I : BB)
      CloneMap[&I] = &I;
  }
}
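// Mapping every argument, block, and instruction to itself means the
// ValueMapper set up in run() treats those values as already mapped, so
// remapFunction() rewrites types and operands in place rather than producing
// a separate cloned function (hence the helper's name).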

bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // Record the functions which need to be remapped.
  // The second element of the pair indicates whether the function has to have
  // its arguments or return types adjusted.
  SmallVector<std::pair<Function *, bool>> NeedsRemap;

  BufferFatPtrToStructTypeMap StructTM(DL);
  BufferFatPtrToIntTypeMap IntTM(DL);
  for (const GlobalVariable &GV : M.globals()) {
    if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER)
      report_fatal_error("Global variables with a buffer fat pointer address "
                         "space (7) are not supported");
    Type *VT = GV.getValueType();
    if (VT != StructTM.remapType(VT))
      report_fatal_error("Global variables that contain buffer fat pointers "
                         "(address space 7 pointers) are unsupported. Use "
                         "buffer resource pointers (address space 8) instead.");
  }

  StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
  for (Function &F : M.functions()) {
    bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
    bool BodyChanges = containsBufferFatPointers(F, &StructTM);
    Changed |= MemOpsRewrite.processFunction(F);
    if (InterfaceChange || BodyChanges)
      NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
  }
  if (NeedsRemap.empty())
    return Changed;

  SmallVector<Function *> NeedsPostProcess;
  SmallVector<Function *> Intrinsics;
  // Keep one big map so as to memoize constants across functions.
  ValueToValueMapTy CloneMap;
  FatPtrConstMaterializer Materializer(&StructTM, CloneMap, &IntTM, DL);

  ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
  for (auto [F, InterfaceChange] : NeedsRemap) {
    Function *NewF = F;
    if (InterfaceChange)
      NewF = moveFunctionAdaptingType(
          F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())),
          CloneMap);
    else
      makeCloneInPlaceMap(F, CloneMap);
    LowerInFuncs.remapFunction(*NewF);
    if (NewF->isIntrinsic())
      Intrinsics.push_back(NewF);
    else
      NeedsPostProcess.push_back(NewF);
    if (InterfaceChange) {
      F->replaceAllUsesWith(NewF);
      F->eraseFromParent();
    }
    Changed = true;
  }
  StructTM.clear();
  IntTM.clear();
  CloneMap.clear();

  SplitPtrStructs Splitter(M.getContext(), &TM);
  for (Function *F : NeedsPostProcess)
    Splitter.processFunction(*F);
  for (Function *F : Intrinsics) {
    if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
      F->eraseFromParent();
    } else {
      std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
      if (NewF)
        F->replaceAllUsesWith(*NewF);
    }
  }
  return Changed;
}

bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return run(M, TM);
}

char AMDGPULowerBufferFatPointers::ID = 0;

char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID;

void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
}

#define PASS_DESC "Lower buffer fat pointer operations to buffer resources"
INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false,
                    false)
#undef PASS_DESC

ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() {
  return new AMDGPULowerBufferFatPointers();
}

PreservedAnalyses
AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) {
  return AMDGPULowerBufferFatPointers().run(M, TM) ? PreservedAnalyses::none()
                                                   : PreservedAnalyses::all();
}
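
// Usage note (illustrative; this assumes DEBUG_TYPE, defined earlier in this
// file, expands to "amdgpu-lower-buffer-fat-pointers"): the new-pass-manager
// entry point above allows the lowering to be exercised in isolation, e.g.
//   opt -mtriple=amdgcn-- -S -passes=amdgpu-lower-buffer-fat-pointers in.ll
// while codegen pipelines construct the legacy pass via
// createAMDGPULowerBufferFatPointersPass().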