//===-- AMDGPULowerBufferFatPointers.cpp ---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers operations on buffer fat pointers (addrspace 7) to
// operations on buffer resources (addrspace 8) and is needed for correct
// codegen.
//
// # Background
//
// Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists
// of a 128-bit buffer descriptor and a 32-bit offset into that descriptor.
// The buffer resource part needs to be a "raw" buffer resource (it must have
// a stride of 0 and bounds checks must be in raw buffer mode or disabled).
//
// When these requirements are met, a buffer resource can be treated as a
// typical (though quite wide) pointer that follows typical LLVM pointer
// semantics. This allows the frontend to reason about such buffers (which are
// often encountered in the context of SPIR-V kernels).
//
// However, because of their non-power-of-2 size, these fat pointers cannot be
// present during translation to MIR (though this restriction may be lifted
// during the transition to GlobalISel). Therefore, this pass is needed in
// order to correctly implement these fat pointers.
//
// The resource intrinsics take the resource part (the address space 8 pointer)
// and the offset part (the 32-bit integer) as separate arguments. In addition,
// many users of these buffers manipulate the offset while leaving the resource
// part alone. For these reasons, we typically want to keep the resource and
// offset parts in separate variables, but combine them together when
// encountering cases where this is required, such as by inserting these values
// into aggregates or moving them to memory.
//
// Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8)
// %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8),
// i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes
// `{vector<Nxp8>, vector<Nxi32>}` and its component parts.
//
// # Implementation
//
// This pass proceeds in three main phases:
//
// ## Rewriting loads and stores of p7
//
// The first phase is to rewrite away all loads and stores of
// `ptr addrspace(7)`, including aggregates containing such pointers, to ones
// that use `i160`. This is handled by `StoreFatPtrsAsIntsVisitor`, which
// visits loads, stores, and allocas and, if the loaded or stored type contains
// `ptr addrspace(7)`, rewrites that type to one where the p7s are replaced by
// i160s, copying other parts of aggregates as needed. In the case of a store,
// each pointer is `ptrtoint`d to i160 before storing, and loaded integers are
// `inttoptr`d back. This same transformation is applied to vectors of
// pointers.
//
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we would
// have to handle the incompatibility between a `{Nxp8, Nxi32}` representation
// and `Nxi160` directly. Instead, that transposing action (where the vectors
// of resources and vectors of offsets are concatenated before being stored to
// memory) is handled through implementing `inttoptr` and `ptrtoint` only.
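//
// As an illustrative sketch (not a verbatim test case), a store of a fat
// pointer such as
// ```
// store ptr addrspace(7) %p, ptr %slot
// ```
// is rewritten by this phase into
// ```
// %p.int = ptrtoint ptr addrspace(7) %p to i160
// store i160 %p.int, ptr %slot
// ```
// and the corresponding load performs the `inttoptr` cast in the other
// direction.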
//
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
// ## Type remapping
//
// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
// to the corresponding struct type, which has a resource part and an offset
// part.
//
// This is done using a `BufferFatPtrToStructTypeMap` and a
// `FatPtrConstMaterializer`, usually by way of `setType`ing values. Constants
// are handled here because there isn't a good way to fix them up later.
//
// This has the downside of leaving the IR in an invalid state (for example,
// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
// but all such invalid states will be resolved by the third phase.
//
// Functions that don't take buffer fat pointers are modified in place. Those
// that do take such pointers have their basic blocks moved to a new function
// with `{ptr addrspace(8), i32}` arguments and return values.
// This phase also records intrinsics so that they can be remangled or deleted
// later.
//
// ## Splitting pointer structs
//
// The meat of this pass consists of defining semantics for operations that
// produce or consume [vectors of] buffer fat pointers in terms of their
// resource and offset parts. This is accomplished through the
// `SplitPtrStructs` visitor.
//
// In the first pass through each function that is being lowered, the splitter
// inserts new instructions to implement the split-structures behavior, which
// is needed for correctness and performance. It records a list of "split
// users", instructions that are being replaced by operations on the resource
// and offset parts.
//
// Split users do not necessarily need to produce parts themselves
// (a `load float, ptr addrspace(7)` does not, for example), but, if they do
// not generate buffer fat pointers, they must RAUW in their replacement
// instructions during the initial visit.
//
// When these new instructions are created, they use the split parts recorded
// for their initial arguments in order to generate their replacements,
// creating a parallel set of instructions that does not refer to the original
// fat pointer values but instead to their resource and offset components.
//
// Instructions such as `extractvalue` that produce buffer fat pointers from
// sources that do not have split parts have such parts generated using
// `extractvalue`. This is also the initial handling of PHI nodes, which
// are then cleaned up.
//
// ### Conditionals
//
// PHI nodes are initially given resource parts via `extractvalue`. However,
// this is not an efficient rewrite of such nodes, as, in most cases, the
// resource part in a conditional or loop remains constant throughout the loop
// and only the offset varies. Failing to optimize away these constant
// resources would cause additional registers to be sent around loops and
// might lead to waterfall loops being generated for buffer operations due to
// the "non-uniform" resource argument.
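//
// As an illustrative sketch (not a verbatim test case), consider a pointer
// that is only advanced inside a loop:
// ```
// loop:
//   %p = phi ptr addrspace(7) [ %base, %entry ], [ %p.next, %loop ]
//   ...
//   %p.next = getelementptr i8, ptr addrspace(7) %p, i32 16
//   br i1 %cond, label %loop, label %exit
// ```
// Here, the resource part of `%p` is always the resource part of `%base`, so
// after splitting only the offset needs a PHI; the resource can be used
// directly and stays uniform whenever `%base`'s resource is uniform.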
//
// Therefore, after all instructions have been visited, the pointer splitter
// post-processes all encountered conditionals. Given a PHI node or select,
// getPossibleRsrcRoots() collects all values that the resource parts of that
// conditional's inputs could come from, as well as all conditional
// instructions encountered during the search. If, after filtering out the
// initial node itself, the set of encountered conditionals is a subset of the
// potential roots and there is a single potential resource that isn't in the
// conditional set, that value is the only possible value the resource
// argument could have throughout the control flow.
//
// If that condition is met, then a PHI node can have its resource part
// changed to the singleton value and then be replaced by a PHI on the
// offsets. Otherwise, each PHI node is split into two, one for the resource
// part and one for the offset part, which replace the temporary
// `extractvalue` instructions that were added during the first pass.
//
// Similar logic applies to `select`, where
// `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y`
// can be split into `%z.rsrc = %x.rsrc` and
// `%z.off = select i1 %cond, i32 %x.off, i32 %y.off`
// if both `%x` and `%y` have the same resource part, but two `select`
// operations will be needed if they do not.
//
// ### Final processing
//
// After conditionals have been cleaned up, the IR for each function is
// rewritten to remove all the old instructions that have been split up.
//
// Any instruction that used to produce a buffer fat pointer (and therefore now
// produces a resource-and-offset struct after type remapping) is
// replaced as follows:
// 1. All debug value annotations are cloned to reflect that the resource part
//    and offset parts are computed separately and constitute different
//    fragments of the underlying source language variable.
// 2. All uses that were themselves split are replaced by a `poison` of the
//    struct type, as they will themselves be erased soon. This rule, combined
//    with debug handling, should leave the use lists of split instructions
//    empty in almost all cases.
// 3. If a user of the original struct-valued result remains, the structure
//    needed for the new types to work is constructed out of the newly-defined
//    parts, and the original instruction is replaced by this structure
//    before being erased. Instructions requiring this construction include
//    `ret` and `insertvalue`.
//
// # Consequences
//
// This pass does not alter the CFG.
//
// Alias analysis information will become coarser, as the LLVM alias analyzer
// cannot handle the buffer intrinsics. Specifically, while we can determine
// that the following two loads do not alias:
// ```
// %y = getelementptr i32, ptr addrspace(7) %x, i32 1
// %a = load i32, ptr addrspace(7) %x
// %b = load i32, ptr addrspace(7) %y
// ```
// we cannot (except through some code that runs during scheduling) determine
// that the rewritten loads below do not alias:
// ```
// %y.off = add i32 %x.off, 1
// %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc,
//     i32 %x.off, ...)
// %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc,
//     i32 %y.off, ...)
// ```
// However, existing alias information is preserved.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"

using namespace llvm;

static constexpr unsigned BufferOffsetWidth = 32;

namespace {
/// Recursively replace instances of ptr addrspace(7) and vector<Nxptr
/// addrspace(7)> with some other type as defined by the relevant subclass.
class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper {
  DenseMap<Type *, Type *> Map;

  Type *remapTypeImpl(Type *Ty, SmallPtrSetImpl<StructType *> &Seen);

protected:
  virtual Type *remapScalar(PointerType *PT) = 0;
  virtual Type *remapVector(VectorType *VT) = 0;

  const DataLayout &DL;

public:
  BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {}
  Type *remapType(Type *SrcTy) override;
  void clear() { Map.clear(); }
};

/// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to
/// vector<Nxi160> in order to correctly handle loading/storing these values
/// from memory.
class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase {
  using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;

protected:
  Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); }
  Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); }
};

/// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset
/// parts of the pointer) so that we can easily rewrite operations on these
/// values that aren't loading them from or storing them to memory.
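/// For example (illustrative), `ptr addrspace(7)` becomes
/// `{ptr addrspace(8), i32}` and `<4 x ptr addrspace(7)>` becomes
/// `{<4 x ptr addrspace(8)>, <4 x i32>}`.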
267 class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase { 268 using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase; 269 270 protected: 271 Type *remapScalar(PointerType *PT) override; 272 Type *remapVector(VectorType *VT) override; 273 }; 274 } // namespace 275 276 // This code is adapted from the type remapper in lib/Linker/IRMover.cpp 277 Type *BufferFatPtrTypeLoweringBase::remapTypeImpl( 278 Type *Ty, SmallPtrSetImpl<StructType *> &Seen) { 279 Type **Entry = &Map[Ty]; 280 if (*Entry) 281 return *Entry; 282 if (auto *PT = dyn_cast<PointerType>(Ty)) { 283 if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { 284 return *Entry = remapScalar(PT); 285 } 286 } 287 if (auto *VT = dyn_cast<VectorType>(Ty)) { 288 auto *PT = dyn_cast<PointerType>(VT->getElementType()); 289 if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { 290 return *Entry = remapVector(VT); 291 } 292 return *Entry = Ty; 293 } 294 // Whether the type is one that is structurally uniqued - that is, if it is 295 // not a named struct (the only kind of type where multiple structurally 296 // identical types that have a distinct `Type*`) 297 StructType *TyAsStruct = dyn_cast<StructType>(Ty); 298 bool IsUniqued = !TyAsStruct || TyAsStruct->isLiteral(); 299 // Base case for ints, floats, opaque pointers, and so on, which don't 300 // require recursion. 301 if (Ty->getNumContainedTypes() == 0 && IsUniqued) 302 return *Entry = Ty; 303 if (!IsUniqued) { 304 // Create a dummy type for recursion purposes. 305 if (!Seen.insert(TyAsStruct).second) { 306 StructType *Placeholder = StructType::create(Ty->getContext()); 307 return *Entry = Placeholder; 308 } 309 } 310 bool Changed = false; 311 SmallVector<Type *> ElementTypes(Ty->getNumContainedTypes(), nullptr); 312 for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) { 313 Type *OldElem = Ty->getContainedType(I); 314 Type *NewElem = remapTypeImpl(OldElem, Seen); 315 ElementTypes[I] = NewElem; 316 Changed |= (OldElem != NewElem); 317 } 318 // Recursive calls to remapTypeImpl() may have invalidated pointer. 319 Entry = &Map[Ty]; 320 if (!Changed) { 321 return *Entry = Ty; 322 } 323 if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) 324 return *Entry = ArrayType::get(ElementTypes[0], ArrTy->getNumElements()); 325 if (auto *FnTy = dyn_cast<FunctionType>(Ty)) 326 return *Entry = FunctionType::get(ElementTypes[0], 327 ArrayRef(ElementTypes).slice(1), 328 FnTy->isVarArg()); 329 if (auto *STy = dyn_cast<StructType>(Ty)) { 330 // Genuine opaque types don't have a remapping. 
331 if (STy->isOpaque()) 332 return *Entry = Ty; 333 bool IsPacked = STy->isPacked(); 334 if (IsUniqued) 335 return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked); 336 SmallString<16> Name(STy->getName()); 337 STy->setName(""); 338 Type **RecursionEntry = &Map[Ty]; 339 if (*RecursionEntry) { 340 auto *Placeholder = cast<StructType>(*RecursionEntry); 341 Placeholder->setBody(ElementTypes, IsPacked); 342 Placeholder->setName(Name); 343 return *Entry = Placeholder; 344 } 345 return *Entry = StructType::create(Ty->getContext(), ElementTypes, Name, 346 IsPacked); 347 } 348 llvm_unreachable("Unknown type of type that contains elements"); 349 } 350 351 Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) { 352 SmallPtrSet<StructType *, 2> Visited; 353 return remapTypeImpl(SrcTy, Visited); 354 } 355 356 Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) { 357 LLVMContext &Ctx = PT->getContext(); 358 return StructType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), 359 IntegerType::get(Ctx, BufferOffsetWidth)); 360 } 361 362 Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) { 363 ElementCount EC = VT->getElementCount(); 364 LLVMContext &Ctx = VT->getContext(); 365 Type *RsrcVec = 366 VectorType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), EC); 367 Type *OffVec = VectorType::get(IntegerType::get(Ctx, BufferOffsetWidth), EC); 368 return StructType::get(RsrcVec, OffVec); 369 } 370 371 static bool isBufferFatPtrOrVector(Type *Ty) { 372 if (auto *PT = dyn_cast<PointerType>(Ty->getScalarType())) 373 return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER; 374 return false; 375 } 376 377 // True if the type is {ptr addrspace(8), i32} or a struct containing vectors of 378 // those types. Used to quickly skip instructions we don't need to process. 379 static bool isSplitFatPtr(Type *Ty) { 380 auto *ST = dyn_cast<StructType>(Ty); 381 if (!ST) 382 return false; 383 if (!ST->isLiteral() || ST->getNumElements() != 2) 384 return false; 385 auto *MaybeRsrc = 386 dyn_cast<PointerType>(ST->getElementType(0)->getScalarType()); 387 auto *MaybeOff = 388 dyn_cast<IntegerType>(ST->getElementType(1)->getScalarType()); 389 return MaybeRsrc && MaybeOff && 390 MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE && 391 MaybeOff->getBitWidth() == BufferOffsetWidth; 392 } 393 394 // True if the result type or any argument types are buffer fat pointers. 395 static bool isBufferFatPtrConst(Constant *C) { 396 Type *T = C->getType(); 397 return isBufferFatPtrOrVector(T) || any_of(C->operands(), [](const Use &U) { 398 return isBufferFatPtrOrVector(U.get()->getType()); 399 }); 400 } 401 402 namespace { 403 /// Convert [vectors of] buffer fat pointers to integers when they are read from 404 /// or stored to memory. This ensures that these pointers will have the same 405 /// memory layout as before they are lowered, even though they will no longer 406 /// have their previous layout in registers/in the program (they'll be broken 407 /// down into resource and offset parts). This has the downside of imposing 408 /// marshalling costs when reading or storing these values, but since placing 409 /// such pointers into memory is an uncommon operation at best, we feel that 410 /// this cost is acceptable for better performance in the common case. 
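/// As an illustrative sketch (not taken from a test), a store of an aggregate
/// that contains a fat pointer, such as
///   store {i32, ptr addrspace(7)} %v, ptr %slot
/// is rewritten so that the pointer field is `ptrtoint`ed to i160 and an
/// equivalent {i32, i160} value is stored instead; the i32 field is copied
/// through unchanged.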
class StoreFatPtrsAsIntsVisitor
    : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
  BufferFatPtrToIntTypeMap *TypeMap;

  ValueToValueMapTy ConvertedForStore;

  IRBuilder<> IRB;

  // Convert all the buffer fat pointers within the input value to integers
  // so that it can be stored in memory.
  Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
  // Convert all the i160s that need to be buffer fat pointers (as specified
  // by the To type) into those pointers to preserve the semantics of the rest
  // of the program.
  Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);

public:
  StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
      : TypeMap(TypeMap), IRB(Ctx) {}
  bool processFunction(Function &F);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitAllocaInst(AllocaInst &I);
  bool visitLoadInst(LoadInst &LI);
  bool visitStoreInst(StoreInst &SI);
  bool visitGetElementPtrInst(GetElementPtrInst &I);
};
} // namespace

Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
                                                const Twine &Name) {
  if (From == To)
    return V;
  ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
  if (Find != ConvertedForStore.end())
    return Find->second;
  if (isBufferFatPtrOrVector(From)) {
    Value *Cast = IRB.CreatePtrToInt(V, To, Name + ".int");
    ConvertedForStore[V] = Cast;
    return Cast;
  }
  if (From->getNumContainedTypes() == 0)
    return V;
  // Structs, arrays, and other compound types.
  Value *Ret = PoisonValue::get(To);
  if (auto *AT = dyn_cast<ArrayType>(From)) {
    Type *FromPart = AT->getArrayElementType();
    Type *ToPart = cast<ArrayType>(To)->getElementType();
    for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
      Value *Field = IRB.CreateExtractValue(V, I);
      Value *NewField =
          fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(I));
      Ret = IRB.CreateInsertValue(Ret, NewField, I);
    }
  } else {
    for (auto [Idx, FromPart, ToPart] :
         enumerate(From->subtypes(), To->subtypes())) {
      Value *Field = IRB.CreateExtractValue(V, Idx);
      Value *NewField =
          fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(Idx));
      Ret = IRB.CreateInsertValue(Ret, NewField, Idx);
    }
  }
  ConvertedForStore[V] = Ret;
  return Ret;
}

Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
                                                const Twine &Name) {
  if (From == To)
    return V;
  if (isBufferFatPtrOrVector(To)) {
    Value *Cast = IRB.CreateIntToPtr(V, To, Name + ".ptr");
    return Cast;
  }
  if (From->getNumContainedTypes() == 0)
    return V;
  // Structs, arrays, and other compound types.
  Value *Ret = PoisonValue::get(To);
  if (auto *AT = dyn_cast<ArrayType>(From)) {
    Type *FromPart = AT->getArrayElementType();
    Type *ToPart = cast<ArrayType>(To)->getElementType();
    for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
      Value *Field = IRB.CreateExtractValue(V, I);
      Value *NewField =
          intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(I));
      Ret = IRB.CreateInsertValue(Ret, NewField, I);
    }
  } else {
    for (auto [Idx, FromPart, ToPart] :
         enumerate(From->subtypes(), To->subtypes())) {
      Value *Field = IRB.CreateExtractValue(V, Idx);
      Value *NewField =
          intsToFatPtrs(Field, FromPart, ToPart, Name + "."
+ Twine(Idx)); 505 Ret = IRB.CreateInsertValue(Ret, NewField, Idx); 506 } 507 } 508 return Ret; 509 } 510 511 bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) { 512 bool Changed = false; 513 // The visitors will mutate GEPs and allocas, but will push loads and stores 514 // to the worklist to avoid invalidation. 515 for (Instruction &I : make_early_inc_range(instructions(F))) { 516 Changed |= visit(I); 517 } 518 ConvertedForStore.clear(); 519 return Changed; 520 } 521 522 bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) { 523 Type *Ty = I.getAllocatedType(); 524 Type *NewTy = TypeMap->remapType(Ty); 525 if (Ty == NewTy) 526 return false; 527 I.setAllocatedType(NewTy); 528 return true; 529 } 530 531 bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { 532 Type *Ty = I.getSourceElementType(); 533 Type *NewTy = TypeMap->remapType(Ty); 534 if (Ty == NewTy) 535 return false; 536 // We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so 537 // make sure GEPs don't have different semantics with the new type. 538 I.setSourceElementType(NewTy); 539 I.setResultElementType(TypeMap->remapType(I.getResultElementType())); 540 return true; 541 } 542 543 bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) { 544 Type *Ty = LI.getType(); 545 Type *IntTy = TypeMap->remapType(Ty); 546 if (Ty == IntTy) 547 return false; 548 549 IRB.SetInsertPoint(&LI); 550 auto *NLI = cast<LoadInst>(LI.clone()); 551 NLI->mutateType(IntTy); 552 NLI = IRB.Insert(NLI); 553 copyMetadataForLoad(*NLI, LI); 554 NLI->takeName(&LI); 555 556 Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName()); 557 LI.replaceAllUsesWith(CastBack); 558 LI.eraseFromParent(); 559 return true; 560 } 561 562 bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { 563 Value *V = SI.getValueOperand(); 564 Type *Ty = V->getType(); 565 Type *IntTy = TypeMap->remapType(Ty); 566 if (Ty == IntTy) 567 return false; 568 569 IRB.SetInsertPoint(&SI); 570 Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName()); 571 for (auto *Dbg : at::getAssignmentMarkers(&SI)) 572 Dbg->setValue(IntV); 573 574 SI.setOperand(0, IntV); 575 return true; 576 } 577 578 /// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered 579 /// buffer fat pointer constant. 580 static std::pair<Constant *, Constant *> 581 splitLoweredFatBufferConst(Constant *C) { 582 if (auto *AZ = dyn_cast<ConstantAggregateZero>(C)) 583 return std::make_pair(AZ->getStructElement(0), AZ->getStructElement(1)); 584 if (auto *SC = dyn_cast<ConstantStruct>(C)) 585 return std::make_pair(SC->getOperand(0), SC->getOperand(1)); 586 llvm_unreachable("Conversion should've created a {p8, i32} struct"); 587 } 588 589 namespace { 590 /// Handle the remapping of ptr addrspace(7) constants. 591 class FatPtrConstMaterializer final : public ValueMaterializer { 592 BufferFatPtrToStructTypeMap *TypeMap; 593 BufferFatPtrToIntTypeMap *IntTypeMap; 594 // An internal mapper that is used to recurse into the arguments of constants. 595 // While the documentation for `ValueMapper` specifies not to use it 596 // recursively, examination of the logic in mapValue() shows that it can 597 // safely be used recursively when handling constants, like it does in its own 598 // logic. 599 ValueMapper InternalMapper; 600 601 Constant *materializeBufferFatPtrConst(Constant *C); 602 603 const DataLayout &DL; 604 605 public: 606 // UnderlyingMap is the value map this materializer will be filling. 
607 FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap, 608 ValueToValueMapTy &UnderlyingMap, 609 BufferFatPtrToIntTypeMap *IntTypeMap, 610 const DataLayout &DL) 611 : TypeMap(TypeMap), IntTypeMap(IntTypeMap), 612 InternalMapper(UnderlyingMap, RF_None, TypeMap, this), DL(DL) {} 613 virtual ~FatPtrConstMaterializer() = default; 614 615 Value *materialize(Value *V) override; 616 }; 617 } // namespace 618 619 Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { 620 Type *SrcTy = C->getType(); 621 auto *NewTy = dyn_cast<StructType>(TypeMap->remapType(SrcTy)); 622 if (C->isNullValue()) 623 return ConstantAggregateZero::getNullValue(NewTy); 624 if (isa<PoisonValue>(C)) { 625 return ConstantStruct::get(NewTy, 626 {PoisonValue::get(NewTy->getElementType(0)), 627 PoisonValue::get(NewTy->getElementType(1))}); 628 } 629 if (isa<UndefValue>(C)) { 630 return ConstantStruct::get(NewTy, 631 {UndefValue::get(NewTy->getElementType(0)), 632 UndefValue::get(NewTy->getElementType(1))}); 633 } 634 635 if (isa<GlobalValue>(C)) 636 report_fatal_error("Global values containing ptr addrspace(7) (buffer " 637 "fat pointer) values are not supported"); 638 639 if (auto *VC = dyn_cast<ConstantVector>(C)) { 640 if (Constant *S = VC->getSplatValue()) { 641 Constant *NewS = InternalMapper.mapConstant(*S); 642 if (!NewS) 643 return nullptr; 644 auto [Rsrc, Off] = splitLoweredFatBufferConst(NewS); 645 auto EC = VC->getType()->getElementCount(); 646 return ConstantStruct::get(NewTy, {ConstantVector::getSplat(EC, Rsrc), 647 ConstantVector::getSplat(EC, Off)}); 648 } 649 SmallVector<Constant *> Rsrcs; 650 SmallVector<Constant *> Offs; 651 for (Value *Op : VC->operand_values()) { 652 auto *NewOp = dyn_cast_or_null<Constant>(InternalMapper.mapValue(*Op)); 653 if (!NewOp) 654 return nullptr; 655 auto [Rsrc, Off] = splitLoweredFatBufferConst(NewOp); 656 Rsrcs.push_back(Rsrc); 657 Offs.push_back(Off); 658 } 659 Constant *RsrcVec = ConstantVector::get(Rsrcs); 660 Constant *OffVec = ConstantVector::get(Offs); 661 return ConstantStruct::get(NewTy, {RsrcVec, OffVec}); 662 } 663 664 // Constant expressions. This code mirrors how we fix up the equivalent 665 // instructions later. 
666 auto *CE = dyn_cast<ConstantExpr>(C); 667 if (!CE) 668 return nullptr; 669 if (auto *GEPO = dyn_cast<GEPOperator>(C)) { 670 Constant *RemappedPtr = 671 InternalMapper.mapConstant(*cast<Constant>(GEPO->getPointerOperand())); 672 auto [Rsrc, Off] = splitLoweredFatBufferConst(RemappedPtr); 673 Type *OffTy = Off->getType(); 674 bool InBounds = GEPO->isInBounds(); 675 676 MapVector<Value *, APInt> VariableOffs; 677 APInt NewConstOffVal = APInt::getZero(BufferOffsetWidth); 678 if (!GEPO->collectOffset(DL, BufferOffsetWidth, VariableOffs, 679 NewConstOffVal)) 680 report_fatal_error( 681 "Scalable vector or unsized struct in fat pointer GEP"); 682 Constant *OffAccum = nullptr; 683 for (auto [Arg, Multiple] : VariableOffs) { 684 Constant *NewArg = InternalMapper.mapConstant(*cast<Constant>(Arg)); 685 NewArg = ConstantFoldIntegerCast(NewArg, OffTy, /*IsSigned=*/true, DL); 686 if (!Multiple.isOne()) { 687 if (Multiple.isPowerOf2()) { 688 NewArg = ConstantExpr::getShl( 689 NewArg, CE->getIntegerValue(OffTy, APInt(BufferOffsetWidth, 690 Multiple.logBase2()))); 691 } else { 692 NewArg = ConstantExpr::getMul(NewArg, 693 CE->getIntegerValue(OffTy, Multiple)); 694 } 695 } 696 if (OffAccum) { 697 OffAccum = ConstantExpr::getAdd(OffAccum, NewArg); 698 } else { 699 OffAccum = NewArg; 700 } 701 } 702 Constant *NewConstOff = CE->getIntegerValue(OffTy, NewConstOffVal); 703 if (OffAccum) 704 OffAccum = ConstantExpr::getAdd(OffAccum, NewConstOff); 705 else 706 OffAccum = NewConstOff; 707 bool HasNonNegativeOff = false; 708 if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) { 709 HasNonNegativeOff = !CI->isNegative(); 710 } 711 Constant *NewOff = ConstantExpr::getAdd( 712 Off, OffAccum, /*hasNUW=*/InBounds && HasNonNegativeOff, 713 /*hasNSW=*/false); 714 return ConstantStruct::get(NewTy, {Rsrc, NewOff}); 715 } 716 717 if (auto *PI = dyn_cast<PtrToIntOperator>(CE)) { 718 Constant *Parts = 719 InternalMapper.mapConstant(*cast<Constant>(PI->getPointerOperand())); 720 auto [Rsrc, Off] = splitLoweredFatBufferConst(Parts); 721 // Here, we take advantage of the fact that ptrtoint has a built-in 722 // zero-extension behavior. 723 unsigned FatPtrWidth = 724 DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); 725 Constant *RsrcInt = CE->getPtrToInt(Rsrc, SrcTy); 726 unsigned Width = SrcTy->getScalarSizeInBits(); 727 Constant *Shift = 728 CE->getIntegerValue(SrcTy, APInt(Width, BufferOffsetWidth)); 729 Constant *OffCast = 730 ConstantFoldIntegerCast(Off, SrcTy, /*IsSigned=*/false, DL); 731 Constant *RsrcHi = ConstantExpr::getShl( 732 RsrcInt, Shift, Width >= FatPtrWidth, Width > FatPtrWidth); 733 // This should be an or, but those got recently removed. 
734 Constant *Result = ConstantExpr::getAdd(RsrcHi, OffCast, true, true); 735 return Result; 736 } 737 738 if (CE->getOpcode() == Instruction::IntToPtr) { 739 auto *Arg = cast<Constant>(CE->getOperand(0)); 740 unsigned FatPtrWidth = 741 DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); 742 unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE); 743 auto *WantedTy = Arg->getType()->getWithNewBitWidth(FatPtrWidth); 744 Arg = ConstantFoldIntegerCast(Arg, WantedTy, /*IsSigned=*/false, DL); 745 746 Constant *Shift = 747 CE->getIntegerValue(WantedTy, APInt(FatPtrWidth, BufferOffsetWidth)); 748 Type *RsrcIntType = WantedTy->getWithNewBitWidth(RsrcPtrWidth); 749 Type *RsrcTy = NewTy->getElementType(0); 750 Type *OffTy = WantedTy->getWithNewBitWidth(BufferOffsetWidth); 751 Constant *RsrcInt = CE->getTrunc( 752 ConstantFoldBinaryOpOperands(Instruction::LShr, Arg, Shift, DL), 753 RsrcIntType); 754 Constant *Rsrc = CE->getIntToPtr(RsrcInt, RsrcTy); 755 Constant *Off = ConstantFoldIntegerCast(Arg, OffTy, /*isSigned=*/false, DL); 756 757 return ConstantStruct::get(NewTy, {Rsrc, Off}); 758 } 759 760 if (auto *AC = dyn_cast<AddrSpaceCastOperator>(CE)) { 761 unsigned SrcAS = AC->getSrcAddressSpace(); 762 unsigned DstAS = AC->getDestAddressSpace(); 763 auto *Arg = cast<Constant>(AC->getPointerOperand()); 764 auto *NewArg = InternalMapper.mapConstant(*Arg); 765 if (!NewArg) 766 return nullptr; 767 if (SrcAS == AMDGPUAS::BUFFER_FAT_POINTER && 768 DstAS == AMDGPUAS::BUFFER_FAT_POINTER) 769 return NewArg; 770 if (SrcAS == AMDGPUAS::BUFFER_RESOURCE && 771 DstAS == AMDGPUAS::BUFFER_FAT_POINTER) { 772 auto *NullOff = CE->getNullValue(NewTy->getElementType(1)); 773 return ConstantStruct::get(NewTy, {NewArg, NullOff}); 774 } 775 report_fatal_error( 776 "Unsupported address space cast for a buffer fat pointer"); 777 } 778 return nullptr; 779 } 780 781 Value *FatPtrConstMaterializer::materialize(Value *V) { 782 Constant *C = dyn_cast<Constant>(V); 783 if (!C) 784 return nullptr; 785 if (auto *GEPO = dyn_cast<GEPOperator>(C)) { 786 // As a special case, adjust GEP constants that have a ptr addrspace(7) in 787 // their source types here, since the earlier local changes didn't handle 788 // htis. 789 Type *SrcTy = GEPO->getSourceElementType(); 790 Type *NewSrcTy = IntTypeMap->remapType(SrcTy); 791 if (SrcTy != NewSrcTy) { 792 SmallVector<Constant *> Ops; 793 Ops.reserve(GEPO->getNumOperands()); 794 for (const Use &U : GEPO->operands()) 795 Ops.push_back(cast<Constant>(U.get())); 796 auto *NewGEP = ConstantExpr::getGetElementPtr( 797 NewSrcTy, Ops[0], ArrayRef<Constant *>(Ops).slice(1), 798 GEPO->getNoWrapFlags(), GEPO->getInRange()); 799 LLVM_DEBUG(dbgs() << "p7-getting GEP: " << *GEPO << " becomes " << *NewGEP 800 << "\n"); 801 Value *FurtherMap = materialize(NewGEP); 802 return FurtherMap ? FurtherMap : NewGEP; 803 } 804 } 805 // Structs and other types that happen to contain fat pointers get remapped 806 // by the mapValue() logic. 807 if (!isBufferFatPtrConst(C)) 808 return nullptr; 809 return materializeBufferFatPtrConst(C); 810 } 811 812 using PtrParts = std::pair<Value *, Value *>; 813 namespace { 814 // The visitor returns the resource and offset parts for an instruction if they 815 // can be computed, or (nullptr, nullptr) for cases that don't have a meaningful 816 // value mapping. 
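// As an illustrative sketch (not a verbatim test), visiting
//   %q = getelementptr i8, ptr addrspace(7) %p, i32 4
// yields %p's resource part unchanged along with a new offset
//   %q.off = add i32 %p.off, 4
// whereas visiting a `ptrtoint` of a fat pointer returns (nullptr, nullptr),
// since its result is not itself a fat pointer and it is instead RAUW'd with
// the rewritten computation.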
class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
  ValueToValueMapTy RsrcParts;
  ValueToValueMapTy OffParts;

  // Track instructions that have been rewritten into a user of the component
  // parts of their ptr addrspace(7) input. Instructions that produced
  // ptr addrspace(7) values should **not** be RAUW'd before being added to
  // this set, as that replacement will be handled in a post-visit step.
  // However, instructions that yield values that aren't fat pointers
  // (e.g. ptrtoint) should RAUW themselves with new instructions that use the
  // split parts of their arguments during processing.
  DenseSet<Instruction *> SplitUsers;

  // Nodes that need a second look once we've computed the parts for all other
  // instructions to see if, for example, we really need to phi on the resource
  // part.
  SmallVector<Instruction *> Conditionals;
  // Temporary instructions produced while lowering conditionals that should be
  // killed.
  SmallVector<Instruction *> ConditionalTemps;

  // Subtarget info, needed for determining what cache control bits to set.
  const TargetMachine *TM;
  const GCNSubtarget *ST;

  IRBuilder<> IRB;

  // Copy metadata between instructions if applicable.
  void copyMetadata(Value *Dest, Value *Src);

  // Get the resource and offset parts of the value V, inserting appropriate
  // extractvalue calls if needed.
  PtrParts getPtrParts(Value *V);

  // Given an instruction that could produce multiple resource parts (a PHI or
  // select), collect the set of values that its resource part could have come
  // from (the `Roots`) and the set of conditional instructions visited during
  // the search (`Seen`). If, after removing the root of the search from `Seen`
  // and `Roots`, `Seen` is a subset of `Roots` and `Roots - Seen` contains one
  // element, the resource part of that element can replace the resource part
  // of all other elements in `Seen`.
  void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl<Value *> &Roots,
                            SmallPtrSetImpl<Value *> &Seen);
  void processConditionals();

  // If an instruction has been split into resource and offset parts,
  // delete that instruction. If any of its uses have not themselves been split
  // into parts (for example, an insertvalue), construct the struct that the
  // type rewrites declared the dying instruction should produce, and use that.
  // Also, kill the temporary extractvalue operations produced by the two-stage
  // lowering of PHIs and conditionals.
869 void killAndReplaceSplitInstructions(SmallVectorImpl<Instruction *> &Origs); 870 871 void setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx); 872 void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); 873 void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); 874 Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty, 875 Align Alignment, AtomicOrdering Order, 876 bool IsVolatile, SyncScope::ID SSID); 877 878 public: 879 SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM) 880 : TM(TM), ST(nullptr), IRB(Ctx) {} 881 882 void processFunction(Function &F); 883 884 PtrParts visitInstruction(Instruction &I); 885 PtrParts visitLoadInst(LoadInst &LI); 886 PtrParts visitStoreInst(StoreInst &SI); 887 PtrParts visitAtomicRMWInst(AtomicRMWInst &AI); 888 PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI); 889 PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP); 890 891 PtrParts visitPtrToIntInst(PtrToIntInst &PI); 892 PtrParts visitIntToPtrInst(IntToPtrInst &IP); 893 PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I); 894 PtrParts visitICmpInst(ICmpInst &Cmp); 895 PtrParts visitFreezeInst(FreezeInst &I); 896 897 PtrParts visitExtractElementInst(ExtractElementInst &I); 898 PtrParts visitInsertElementInst(InsertElementInst &I); 899 PtrParts visitShuffleVectorInst(ShuffleVectorInst &I); 900 901 PtrParts visitPHINode(PHINode &PHI); 902 PtrParts visitSelectInst(SelectInst &SI); 903 904 PtrParts visitIntrinsicInst(IntrinsicInst &II); 905 }; 906 } // namespace 907 908 void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) { 909 auto *DestI = dyn_cast<Instruction>(Dest); 910 auto *SrcI = dyn_cast<Instruction>(Src); 911 912 if (!DestI || !SrcI) 913 return; 914 915 DestI->copyMetadata(*SrcI); 916 } 917 918 PtrParts SplitPtrStructs::getPtrParts(Value *V) { 919 assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts " 920 "of something that wasn't rewritten"); 921 auto *RsrcEntry = &RsrcParts[V]; 922 auto *OffEntry = &OffParts[V]; 923 if (*RsrcEntry && *OffEntry) 924 return {*RsrcEntry, *OffEntry}; 925 926 if (auto *C = dyn_cast<Constant>(V)) { 927 auto [Rsrc, Off] = splitLoweredFatBufferConst(C); 928 return {*RsrcEntry = Rsrc, *OffEntry = Off}; 929 } 930 931 IRBuilder<>::InsertPointGuard Guard(IRB); 932 if (auto *I = dyn_cast<Instruction>(V)) { 933 LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n"); 934 auto [Rsrc, Off] = visit(*I); 935 if (Rsrc && Off) 936 return {*RsrcEntry = Rsrc, *OffEntry = Off}; 937 // We'll be creating the new values after the relevant instruction. 938 // This instruction generates a value and so isn't a terminator. 939 IRB.SetInsertPoint(*I->getInsertionPointAfterDef()); 940 IRB.SetCurrentDebugLocation(I->getDebugLoc()); 941 } else if (auto *A = dyn_cast<Argument>(V)) { 942 IRB.SetInsertPointPastAllocas(A->getParent()); 943 IRB.SetCurrentDebugLocation(DebugLoc()); 944 } 945 Value *Rsrc = IRB.CreateExtractValue(V, 0, V->getName() + ".rsrc"); 946 Value *Off = IRB.CreateExtractValue(V, 1, V->getName() + ".off"); 947 return {*RsrcEntry = Rsrc, *OffEntry = Off}; 948 } 949 950 /// Returns the instruction that defines the resource part of the value V. 951 /// Note that this is not getUnderlyingObject(), since that looks through 952 /// operations like ptrmask which might modify the resource part. 
953 /// 954 /// We can limit ourselves to just looking through GEPs followed by looking 955 /// through addrspacecasts because only those two operations preserve the 956 /// resource part, and because operations on an `addrspace(8)` (which is the 957 /// legal input to this addrspacecast) would produce a different resource part. 958 static Value *rsrcPartRoot(Value *V) { 959 while (auto *GEP = dyn_cast<GEPOperator>(V)) 960 V = GEP->getPointerOperand(); 961 while (auto *ASC = dyn_cast<AddrSpaceCastOperator>(V)) 962 V = ASC->getPointerOperand(); 963 return V; 964 } 965 966 void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I, 967 SmallPtrSetImpl<Value *> &Roots, 968 SmallPtrSetImpl<Value *> &Seen) { 969 if (auto *PHI = dyn_cast<PHINode>(I)) { 970 if (!Seen.insert(I).second) 971 return; 972 for (Value *In : PHI->incoming_values()) { 973 In = rsrcPartRoot(In); 974 Roots.insert(In); 975 if (isa<PHINode, SelectInst>(In)) 976 getPossibleRsrcRoots(cast<Instruction>(In), Roots, Seen); 977 } 978 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 979 if (!Seen.insert(SI).second) 980 return; 981 Value *TrueVal = rsrcPartRoot(SI->getTrueValue()); 982 Value *FalseVal = rsrcPartRoot(SI->getFalseValue()); 983 Roots.insert(TrueVal); 984 Roots.insert(FalseVal); 985 if (isa<PHINode, SelectInst>(TrueVal)) 986 getPossibleRsrcRoots(cast<Instruction>(TrueVal), Roots, Seen); 987 if (isa<PHINode, SelectInst>(FalseVal)) 988 getPossibleRsrcRoots(cast<Instruction>(FalseVal), Roots, Seen); 989 } else { 990 llvm_unreachable("getPossibleRsrcParts() only works on phi and select"); 991 } 992 } 993 994 void SplitPtrStructs::processConditionals() { 995 SmallDenseMap<Instruction *, Value *> FoundRsrcs; 996 SmallPtrSet<Value *, 4> Roots; 997 SmallPtrSet<Value *, 4> Seen; 998 for (Instruction *I : Conditionals) { 999 // These have to exist by now because we've visited these nodes. 1000 Value *Rsrc = RsrcParts[I]; 1001 Value *Off = OffParts[I]; 1002 assert(Rsrc && Off && "must have visited conditionals by now"); 1003 1004 std::optional<Value *> MaybeRsrc; 1005 auto MaybeFoundRsrc = FoundRsrcs.find(I); 1006 if (MaybeFoundRsrc != FoundRsrcs.end()) { 1007 MaybeRsrc = MaybeFoundRsrc->second; 1008 } else { 1009 IRBuilder<>::InsertPointGuard Guard(IRB); 1010 Roots.clear(); 1011 Seen.clear(); 1012 getPossibleRsrcRoots(I, Roots, Seen); 1013 LLVM_DEBUG(dbgs() << "Processing conditional: " << *I << "\n"); 1014 #ifndef NDEBUG 1015 for (Value *V : Roots) 1016 LLVM_DEBUG(dbgs() << "Root: " << *V << "\n"); 1017 for (Value *V : Seen) 1018 LLVM_DEBUG(dbgs() << "Seen: " << *V << "\n"); 1019 #endif 1020 // If we are our own possible root, then we shouldn't block our 1021 // replacement with a valid incoming value. 1022 Roots.erase(I); 1023 // We don't want to block the optimization for conditionals that don't 1024 // refer to themselves but did see themselves during the traversal. 1025 Seen.erase(I); 1026 1027 if (set_is_subset(Seen, Roots)) { 1028 auto Diff = set_difference(Roots, Seen); 1029 if (Diff.size() == 1) { 1030 Value *RootVal = *Diff.begin(); 1031 // Handle the case where previous loops already looked through 1032 // an addrspacecast. 
1033 if (isSplitFatPtr(RootVal->getType())) 1034 MaybeRsrc = std::get<0>(getPtrParts(RootVal)); 1035 else 1036 MaybeRsrc = RootVal; 1037 } 1038 } 1039 } 1040 1041 if (auto *PHI = dyn_cast<PHINode>(I)) { 1042 Value *NewRsrc; 1043 StructType *PHITy = cast<StructType>(PHI->getType()); 1044 IRB.SetInsertPoint(*PHI->getInsertionPointAfterDef()); 1045 IRB.SetCurrentDebugLocation(PHI->getDebugLoc()); 1046 if (MaybeRsrc) { 1047 NewRsrc = *MaybeRsrc; 1048 } else { 1049 Type *RsrcTy = PHITy->getElementType(0); 1050 auto *RsrcPHI = IRB.CreatePHI(RsrcTy, PHI->getNumIncomingValues()); 1051 RsrcPHI->takeName(Rsrc); 1052 for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { 1053 Value *VRsrc = std::get<0>(getPtrParts(V)); 1054 RsrcPHI->addIncoming(VRsrc, BB); 1055 } 1056 copyMetadata(RsrcPHI, PHI); 1057 NewRsrc = RsrcPHI; 1058 } 1059 1060 Type *OffTy = PHITy->getElementType(1); 1061 auto *NewOff = IRB.CreatePHI(OffTy, PHI->getNumIncomingValues()); 1062 NewOff->takeName(Off); 1063 for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { 1064 assert(OffParts.count(V) && "An offset part had to be created by now"); 1065 Value *VOff = std::get<1>(getPtrParts(V)); 1066 NewOff->addIncoming(VOff, BB); 1067 } 1068 copyMetadata(NewOff, PHI); 1069 1070 // Note: We don't eraseFromParent() the temporaries because we don't want 1071 // to put the corrections maps in an inconstent state. That'll be handed 1072 // during the rest of the killing. Also, `ValueToValueMapTy` guarantees 1073 // that references in that map will be updated as well. 1074 ConditionalTemps.push_back(cast<Instruction>(Rsrc)); 1075 ConditionalTemps.push_back(cast<Instruction>(Off)); 1076 Rsrc->replaceAllUsesWith(NewRsrc); 1077 Off->replaceAllUsesWith(NewOff); 1078 1079 // Save on recomputing the cycle traversals in known-root cases. 1080 if (MaybeRsrc) 1081 for (Value *V : Seen) 1082 FoundRsrcs[cast<Instruction>(V)] = NewRsrc; 1083 } else if (isa<SelectInst>(I)) { 1084 if (MaybeRsrc) { 1085 ConditionalTemps.push_back(cast<Instruction>(Rsrc)); 1086 Rsrc->replaceAllUsesWith(*MaybeRsrc); 1087 for (Value *V : Seen) 1088 FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc; 1089 } 1090 } else { 1091 llvm_unreachable("Only PHIs and selects go in the conditionals list"); 1092 } 1093 } 1094 } 1095 1096 void SplitPtrStructs::killAndReplaceSplitInstructions( 1097 SmallVectorImpl<Instruction *> &Origs) { 1098 for (Instruction *I : ConditionalTemps) 1099 I->eraseFromParent(); 1100 1101 for (Instruction *I : Origs) { 1102 if (!SplitUsers.contains(I)) 1103 continue; 1104 1105 SmallVector<DbgValueInst *> Dbgs; 1106 findDbgValues(Dbgs, I); 1107 for (auto *Dbg : Dbgs) { 1108 IRB.SetInsertPoint(Dbg); 1109 auto &DL = I->getModule()->getDataLayout(); 1110 assert(isSplitFatPtr(I->getType()) && 1111 "We should've RAUW'd away loads, stores, etc. 
at this point"); 1112 auto *OffDbg = cast<DbgValueInst>(Dbg->clone()); 1113 copyMetadata(OffDbg, Dbg); 1114 auto [Rsrc, Off] = getPtrParts(I); 1115 1116 int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType()); 1117 int64_t OffSz = DL.getTypeSizeInBits(Off->getType()); 1118 1119 std::optional<DIExpression *> RsrcExpr = 1120 DIExpression::createFragmentExpression(Dbg->getExpression(), 0, 1121 RsrcSz); 1122 std::optional<DIExpression *> OffExpr = 1123 DIExpression::createFragmentExpression(Dbg->getExpression(), RsrcSz, 1124 OffSz); 1125 if (OffExpr) { 1126 OffDbg->setExpression(*OffExpr); 1127 OffDbg->replaceVariableLocationOp(I, Off); 1128 IRB.Insert(OffDbg); 1129 } else { 1130 OffDbg->deleteValue(); 1131 } 1132 if (RsrcExpr) { 1133 Dbg->setExpression(*RsrcExpr); 1134 Dbg->replaceVariableLocationOp(I, Rsrc); 1135 } else { 1136 Dbg->replaceVariableLocationOp(I, UndefValue::get(I->getType())); 1137 } 1138 } 1139 1140 Value *Poison = PoisonValue::get(I->getType()); 1141 I->replaceUsesWithIf(Poison, [&](const Use &U) -> bool { 1142 if (const auto *UI = dyn_cast<Instruction>(U.getUser())) 1143 return SplitUsers.contains(UI); 1144 return false; 1145 }); 1146 1147 if (I->use_empty()) { 1148 I->eraseFromParent(); 1149 continue; 1150 } 1151 IRB.SetInsertPoint(*I->getInsertionPointAfterDef()); 1152 IRB.SetCurrentDebugLocation(I->getDebugLoc()); 1153 auto [Rsrc, Off] = getPtrParts(I); 1154 Value *Struct = PoisonValue::get(I->getType()); 1155 Struct = IRB.CreateInsertValue(Struct, Rsrc, 0); 1156 Struct = IRB.CreateInsertValue(Struct, Off, 1); 1157 copyMetadata(Struct, I); 1158 Struct->takeName(I); 1159 I->replaceAllUsesWith(Struct); 1160 I->eraseFromParent(); 1161 } 1162 } 1163 1164 void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) { 1165 LLVMContext &Ctx = Intr->getContext(); 1166 Intr->addParamAttr(RsrcArgIdx, Attribute::getWithAlignment(Ctx, A)); 1167 } 1168 1169 void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order, 1170 SyncScope::ID SSID) { 1171 switch (Order) { 1172 case AtomicOrdering::Release: 1173 case AtomicOrdering::AcquireRelease: 1174 case AtomicOrdering::SequentiallyConsistent: 1175 IRB.CreateFence(AtomicOrdering::Release, SSID); 1176 break; 1177 default: 1178 break; 1179 } 1180 } 1181 1182 void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order, 1183 SyncScope::ID SSID) { 1184 switch (Order) { 1185 case AtomicOrdering::Acquire: 1186 case AtomicOrdering::AcquireRelease: 1187 case AtomicOrdering::SequentiallyConsistent: 1188 IRB.CreateFence(AtomicOrdering::Acquire, SSID); 1189 break; 1190 default: 1191 break; 1192 } 1193 } 1194 1195 Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, 1196 Type *Ty, Align Alignment, 1197 AtomicOrdering Order, bool IsVolatile, 1198 SyncScope::ID SSID) { 1199 IRB.SetInsertPoint(I); 1200 1201 auto [Rsrc, Off] = getPtrParts(Ptr); 1202 SmallVector<Value *, 5> Args; 1203 if (Arg) 1204 Args.push_back(Arg); 1205 Args.push_back(Rsrc); 1206 Args.push_back(Off); 1207 insertPreMemOpFence(Order, SSID); 1208 // soffset is always 0 for these cases, where we always want any offset to be 1209 // part of bounds checking and we don't know which parts of the GEPs is 1210 // uniform. 1211 Args.push_back(IRB.getInt32(0)); 1212 1213 uint32_t Aux = 0; 1214 bool IsInvariant = 1215 (isa<LoadInst>(I) && I->getMetadata(LLVMContext::MD_invariant_load)); 1216 bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal); 1217 // Atomic loads and stores need glc, atomic read-modify-write doesn't. 
1218 bool IsOneWayAtomic = 1219 !isa<AtomicRMWInst>(I) && Order != AtomicOrdering::NotAtomic; 1220 if (IsOneWayAtomic) 1221 Aux |= AMDGPU::CPol::GLC; 1222 if (IsNonTemporal && !IsInvariant) 1223 Aux |= AMDGPU::CPol::SLC; 1224 if (isa<LoadInst>(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10) 1225 Aux |= (Aux & AMDGPU::CPol::GLC ? AMDGPU::CPol::DLC : 0); 1226 if (IsVolatile) 1227 Aux |= AMDGPU::CPol::VOLATILE; 1228 Args.push_back(IRB.getInt32(Aux)); 1229 1230 Intrinsic::ID IID = Intrinsic::not_intrinsic; 1231 if (isa<LoadInst>(I)) 1232 // TODO: Do we need to do something about atomic loads? 1233 IID = Intrinsic::amdgcn_raw_ptr_buffer_load; 1234 else if (isa<StoreInst>(I)) 1235 IID = Intrinsic::amdgcn_raw_ptr_buffer_store; 1236 else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) { 1237 switch (RMW->getOperation()) { 1238 case AtomicRMWInst::Xchg: 1239 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap; 1240 break; 1241 case AtomicRMWInst::Add: 1242 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add; 1243 break; 1244 case AtomicRMWInst::Sub: 1245 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub; 1246 break; 1247 case AtomicRMWInst::And: 1248 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and; 1249 break; 1250 case AtomicRMWInst::Or: 1251 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or; 1252 break; 1253 case AtomicRMWInst::Xor: 1254 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor; 1255 break; 1256 case AtomicRMWInst::Max: 1257 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax; 1258 break; 1259 case AtomicRMWInst::Min: 1260 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin; 1261 break; 1262 case AtomicRMWInst::UMax: 1263 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax; 1264 break; 1265 case AtomicRMWInst::UMin: 1266 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin; 1267 break; 1268 case AtomicRMWInst::FAdd: 1269 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd; 1270 break; 1271 case AtomicRMWInst::FMax: 1272 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax; 1273 break; 1274 case AtomicRMWInst::FMin: 1275 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin; 1276 break; 1277 case AtomicRMWInst::FSub: { 1278 report_fatal_error("atomic floating point subtraction not supported for " 1279 "buffer resources and should've been expanded away"); 1280 break; 1281 } 1282 case AtomicRMWInst::Nand: 1283 report_fatal_error("atomic nand not supported for buffer resources and " 1284 "should've been expanded away"); 1285 break; 1286 case AtomicRMWInst::UIncWrap: 1287 case AtomicRMWInst::UDecWrap: 1288 report_fatal_error("wrapping increment/decrement not supported for " 1289 "buffer resources and should've ben expanded away"); 1290 break; 1291 case AtomicRMWInst::BAD_BINOP: 1292 llvm_unreachable("Not sure how we got a bad binop"); 1293 } 1294 } 1295 1296 auto *Call = IRB.CreateIntrinsic(IID, Ty, Args); 1297 copyMetadata(Call, I); 1298 setAlign(Call, Alignment, Arg ? 1 : 0); 1299 Call->takeName(I); 1300 1301 insertPostMemOpFence(Order, SSID); 1302 // The "no moving p7 directly" rewrites ensure that this load or store won't 1303 // itself need to be split into parts. 
1304 SplitUsers.insert(I); 1305 I->replaceAllUsesWith(Call); 1306 return Call; 1307 } 1308 1309 PtrParts SplitPtrStructs::visitInstruction(Instruction &I) { 1310 return {nullptr, nullptr}; 1311 } 1312 1313 PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) { 1314 if (!isSplitFatPtr(LI.getPointerOperandType())) 1315 return {nullptr, nullptr}; 1316 handleMemoryInst(&LI, nullptr, LI.getPointerOperand(), LI.getType(), 1317 LI.getAlign(), LI.getOrdering(), LI.isVolatile(), 1318 LI.getSyncScopeID()); 1319 return {nullptr, nullptr}; 1320 } 1321 1322 PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) { 1323 if (!isSplitFatPtr(SI.getPointerOperandType())) 1324 return {nullptr, nullptr}; 1325 Value *Arg = SI.getValueOperand(); 1326 handleMemoryInst(&SI, Arg, SI.getPointerOperand(), Arg->getType(), 1327 SI.getAlign(), SI.getOrdering(), SI.isVolatile(), 1328 SI.getSyncScopeID()); 1329 return {nullptr, nullptr}; 1330 } 1331 1332 PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) { 1333 if (!isSplitFatPtr(AI.getPointerOperand()->getType())) 1334 return {nullptr, nullptr}; 1335 Value *Arg = AI.getValOperand(); 1336 handleMemoryInst(&AI, Arg, AI.getPointerOperand(), Arg->getType(), 1337 AI.getAlign(), AI.getOrdering(), AI.isVolatile(), 1338 AI.getSyncScopeID()); 1339 return {nullptr, nullptr}; 1340 } 1341 1342 // Unlike load, store, and RMW, cmpxchg needs special handling to account 1343 // for the boolean argument. 1344 PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) { 1345 Value *Ptr = AI.getPointerOperand(); 1346 if (!isSplitFatPtr(Ptr->getType())) 1347 return {nullptr, nullptr}; 1348 IRB.SetInsertPoint(&AI); 1349 1350 Type *Ty = AI.getNewValOperand()->getType(); 1351 AtomicOrdering Order = AI.getMergedOrdering(); 1352 SyncScope::ID SSID = AI.getSyncScopeID(); 1353 bool IsNonTemporal = AI.getMetadata(LLVMContext::MD_nontemporal); 1354 1355 auto [Rsrc, Off] = getPtrParts(Ptr); 1356 insertPreMemOpFence(Order, SSID); 1357 1358 uint32_t Aux = 0; 1359 if (IsNonTemporal) 1360 Aux |= AMDGPU::CPol::SLC; 1361 if (AI.isVolatile()) 1362 Aux |= AMDGPU::CPol::VOLATILE; 1363 auto *Call = 1364 IRB.CreateIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Ty, 1365 {AI.getNewValOperand(), AI.getCompareOperand(), Rsrc, 1366 Off, IRB.getInt32(0), IRB.getInt32(Aux)}); 1367 copyMetadata(Call, &AI); 1368 setAlign(Call, AI.getAlign(), 2); 1369 Call->takeName(&AI); 1370 insertPostMemOpFence(Order, SSID); 1371 1372 Value *Res = PoisonValue::get(AI.getType()); 1373 Res = IRB.CreateInsertValue(Res, Call, 0); 1374 if (!AI.isWeak()) { 1375 Value *Succeeded = IRB.CreateICmpEQ(Call, AI.getCompareOperand()); 1376 Res = IRB.CreateInsertValue(Res, Succeeded, 1); 1377 } 1378 SplitUsers.insert(&AI); 1379 AI.replaceAllUsesWith(Res); 1380 return {nullptr, nullptr}; 1381 } 1382 1383 PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { 1384 using namespace llvm::PatternMatch; 1385 Value *Ptr = GEP.getPointerOperand(); 1386 if (!isSplitFatPtr(Ptr->getType())) 1387 return {nullptr, nullptr}; 1388 IRB.SetInsertPoint(&GEP); 1389 1390 auto [Rsrc, Off] = getPtrParts(Ptr); 1391 const DataLayout &DL = GEP.getModule()->getDataLayout(); 1392 bool InBounds = GEP.isInBounds(); 1393 1394 // In order to call emitGEPOffset() and thus not have to reimplement it, 1395 // we need the GEP result to have ptr addrspace(7) type. 
  Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER);
  if (auto *VT = dyn_cast<VectorType>(Off->getType()))
    FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount());
  GEP.mutateType(FatPtrTy);
  Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP);
  GEP.mutateType(Ptr->getType());
  if (match(OffAccum, m_Zero())) { // Constant-zero offset
    SplitUsers.insert(&GEP);
    return {Rsrc, Off};
  }

  bool HasNonNegativeOff = false;
  if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
    HasNonNegativeOff = !CI->isNegative();
  }
  Value *NewOff;
  if (match(Off, m_Zero())) {
    NewOff = OffAccum;
  } else {
    NewOff = IRB.CreateAdd(Off, OffAccum, "",
                           /*hasNUW=*/InBounds && HasNonNegativeOff,
                           /*hasNSW=*/false);
  }
  copyMetadata(NewOff, &GEP);
  NewOff->takeName(&GEP);
  SplitUsers.insert(&GEP);
  return {Rsrc, NewOff};
}

PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
  Value *Ptr = PI.getPointerOperand();
  if (!isSplitFatPtr(Ptr->getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&PI);

  Type *ResTy = PI.getType();
  unsigned Width = ResTy->getScalarSizeInBits();

  auto [Rsrc, Off] = getPtrParts(Ptr);
  const DataLayout &DL = PI.getModule()->getDataLayout();
  unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);

  Value *RsrcInt;
  if (Width <= BufferOffsetWidth)
    RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width));
  else
    RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
  copyMetadata(RsrcInt, &PI);

  Value *Shl = IRB.CreateShl(
      RsrcInt,
      ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "",
      Width >= FatPtrWidth, Width > FatPtrWidth);
  Value *OffCast =
      IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off");
  Value *Res = IRB.CreateOr(Shl, OffCast);
  Res->takeName(&PI);
  SplitUsers.insert(&PI);
  PI.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) {
  if (!isSplitFatPtr(IP.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&IP);
  const DataLayout &DL = IP.getModule()->getDataLayout();
  unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE);
  Value *Int = IP.getOperand(0);
  Type *IntTy = Int->getType();
  Type *RsrcIntTy = IntTy->getWithNewBitWidth(RsrcPtrWidth);
  unsigned Width = IntTy->getScalarSizeInBits();

  auto *RetTy = cast<StructType>(IP.getType());
  Type *RsrcTy = RetTy->getElementType(0);
  Type *OffTy = RetTy->getElementType(1);
  Value *RsrcPart = IRB.CreateLShr(
      Int,
      ConstantExpr::getIntegerValue(IntTy, APInt(Width, BufferOffsetWidth)));
  Value *RsrcInt = IRB.CreateIntCast(RsrcPart, RsrcIntTy, /*isSigned=*/false);
  Value *Rsrc = IRB.CreateIntToPtr(RsrcInt, RsrcTy, IP.getName() + ".rsrc");
  Value *Off =
      IRB.CreateIntCast(Int, OffTy, /*IsSigned=*/false, IP.getName() + ".off");

  copyMetadata(Rsrc, &IP);
  SplitUsers.insert(&IP);
  return {Rsrc, Off};
}

PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *In = I.getPointerOperand();
  // No-op casts preserve parts
  if (In->getType() == I.getType()) {
    auto [Rsrc, Off] = getPtrParts(In);
    SplitUsers.insert(&I);
    return {Rsrc, Off};
  }
  if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE)
    report_fatal_error("Only buffer resources (addrspace 8) can be cast to "
                       "buffer fat pointers (addrspace 7)");
  Type *OffTy = cast<StructType>(I.getType())->getElementType(1);
  Value *ZeroOff = Constant::getNullValue(OffTy);
  SplitUsers.insert(&I);
  return {In, ZeroOff};
}

PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) {
  Value *Lhs = Cmp.getOperand(0);
  if (!isSplitFatPtr(Lhs->getType()))
    return {nullptr, nullptr};
  Value *Rhs = Cmp.getOperand(1);
  IRB.SetInsertPoint(&Cmp);
  ICmpInst::Predicate Pred = Cmp.getPredicate();

  assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
         "Pointer comparison is only equal or unequal");
  auto [LhsRsrc, LhsOff] = getPtrParts(Lhs);
  auto [RhsRsrc, RhsOff] = getPtrParts(Rhs);
  Value *RsrcCmp =
      IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc");
  copyMetadata(RsrcCmp, &Cmp);
  Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off");
  copyMetadata(OffCmp, &Cmp);

  Value *Res = nullptr;
  if (Pred == ICmpInst::ICMP_EQ)
    Res = IRB.CreateAnd(RsrcCmp, OffCmp);
  else if (Pred == ICmpInst::ICMP_NE)
    Res = IRB.CreateOr(RsrcCmp, OffCmp);
  copyMetadata(Res, &Cmp);
  Res->takeName(&Cmp);
  SplitUsers.insert(&Cmp);
  Cmp.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  auto [Rsrc, Off] = getPtrParts(I.getOperand(0));

  Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getVectorOperand();
  Value *Idx = I.getIndexOperand();
  auto [Rsrc, Off] = getPtrParts(Vec);

  Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) {
  // The mutated instructions temporarily don't return vectors, and so
  // we need the generic getType() here to avoid crashes.
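  // (After type remapping, an insertelement on fat-pointer vectors produces a
  // {<N x ptr addrspace(8)>, <N x i32>} struct rather than a vector, so the
  // subclass getType(), which casts the result type to VectorType, cannot be
  // used.)
  //
  // A minimal sketch of the split this visitor performs (hypothetical value
  // names):
  //   %v = insertelement <2 x ptr addrspace(7)> %vec, ptr addrspace(7) %p, i32 0
  // is rewritten, in terms of the recorded parts, as roughly
  //   %v.rsrc = insertelement <2 x ptr addrspace(8)> %vec.rsrc,
  //                           ptr addrspace(8) %p.rsrc, i32 0
  //   %v.off = insertelement <2 x i32> %vec.off, i32 %p.off, i32 0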
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getOperand(0);
  Value *Elem = I.getOperand(1);
  Value *Idx = I.getOperand(2);
  auto [VecRsrc, VecOff] = getPtrParts(Vec);
  auto [ElemRsrc, ElemOff] = getPtrParts(Elem);

  Value *RsrcRes =
      IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) {
  // Cast is needed for the same reason as insertelement's.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);

  Value *V1 = I.getOperand(0);
  Value *V2 = I.getOperand(1);
  ArrayRef<int> Mask = I.getShuffleMask();
  auto [V1Rsrc, V1Off] = getPtrParts(V1);
  auto [V2Rsrc, V2Off] = getPtrParts(V2);

  Value *RsrcRes =
      IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) {
  if (!isSplitFatPtr(PHI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(*PHI.getInsertionPointAfterDef());
  // Phi nodes will be handled in post-processing after we've visited every
  // instruction. However, instead of just returning {nullptr, nullptr},
  // we explicitly create the temporary extractvalue operations that serve as
  // our provisional results so that they end up at the beginning of the block
  // with the PHIs.
  Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc");
  Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off");
  Conditionals.push_back(&PHI);
  SplitUsers.insert(&PHI);
  return {TmpRsrc, TmpOff};
}

PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) {
  if (!isSplitFatPtr(SI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&SI);

  Value *Cond = SI.getCondition();
  Value *True = SI.getTrueValue();
  Value *False = SI.getFalseValue();
  auto [TrueRsrc, TrueOff] = getPtrParts(True);
  auto [FalseRsrc, FalseOff] = getPtrParts(False);

  Value *RsrcRes =
      IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI);
  copyMetadata(RsrcRes, &SI);
  Conditionals.push_back(&SI);
  Value *OffRes =
      IRB.CreateSelect(Cond, TrueOff, FalseOff, SI.getName() + ".off", &SI);
  copyMetadata(OffRes, &SI);
  SplitUsers.insert(&SI);
  return {RsrcRes, OffRes};
}

/// Returns true if this intrinsic needs to be removed when it is
/// applied to `ptr addrspace(7)` values. Calls to these intrinsics are
/// rewritten into calls to versions of that intrinsic on the resource
/// descriptor.
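///
/// The `ptr addrspace(7)` declarations of these intrinsics are erased from
/// the module at the end of the pass (see the loop over `Intrinsics` in
/// `run()`), since by that point their calls are expected to have been
/// rewritten away. As a rough, illustrative example, a call such as
///   call ptr @llvm.invariant.start.p7(i64 256, ptr addrspace(7) %p)
/// becomes a call on the resource part only, approximately
///   call ptr @llvm.invariant.start.p8(i64 256, ptr addrspace(8) %p.rsrc)
/// leaving the p7 overload with no remaining users.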
static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  default:
    return false;
  case Intrinsic::ptrmask:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
    return true;
  }
}

PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
  Intrinsic::ID IID = I.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ptrmask: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    Value *Mask = I.getArgOperand(1);
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    if (Mask->getType() != Off->getType())
      report_fatal_error("offset width is not equal to index width of fat "
                         "pointer (data layout not set up correctly?)");
    Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off");
    copyMetadata(OffRes, &I);
    SplitUsers.insert(&I);
    return {Rsrc, OffRes};
  }
  // Pointer annotation intrinsics that, given their object-wide nature,
  // operate on the resource part.
  case Intrinsic::invariant_start: {
    Value *Ptr = I.getArgOperand(1);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE);
    auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::invariant_end: {
    Value *RealPtr = I.getArgOperand(2);
    if (!isSplitFatPtr(RealPtr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    Value *RealRsrc = getPtrParts(RealPtr).first;
    Value *InvPtr = I.getArgOperand(0);
    Value *Size = I.getArgOperand(1);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()},
                                         {InvPtr, Size, RealRsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    return {NewRsrc, Off};
  }
  }
  return {nullptr, nullptr};
}

void SplitPtrStructs::processFunction(Function &F) {
  ST = &TM->getSubtarget<GCNSubtarget>(F);
  SmallVector<Instruction *, 0> Originals;
  LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName()
                    << "\n");
  for (Instruction &I : instructions(F))
    Originals.push_back(&I);
  for (Instruction *I : Originals) {
    auto [Rsrc, Off] = visit(I);
    assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
           "Can't have a resource but no offset");
    if (Rsrc)
      RsrcParts[I] = Rsrc;
    if (Off)
      OffParts[I] = Off;
  }
  processConditionals();
  killAndReplaceSplitInstructions(Originals);

  // Clean up after ourselves to save on memory.
  RsrcParts.clear();
  OffParts.clear();
  SplitUsers.clear();
  Conditionals.clear();
  ConditionalTemps.clear();
}

namespace {
class AMDGPULowerBufferFatPointers : public ModulePass {
public:
  static char ID;

  AMDGPULowerBufferFatPointers() : ModulePass(ID) {
    initializeAMDGPULowerBufferFatPointersPass(
        *PassRegistry::getPassRegistry());
  }

  bool run(Module &M, const TargetMachine &TM);
  bool runOnModule(Module &M) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // namespace

/// Returns true if there are values that have a buffer fat pointer in them,
/// which means we'll need to perform rewrites on this function. As a side
/// effect, this will populate the type remapping cache.
static bool containsBufferFatPointers(const Function &F,
                                      BufferFatPtrToStructTypeMap *TypeMap) {
  bool HasFatPointers = false;
  for (const BasicBlock &BB : F) {
    for (const Instruction &I : BB) {
      HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
      for (const Use &U : I.operands())
        if (auto *C = dyn_cast<Constant>(U.get()))
          HasFatPointers |= isBufferFatPtrConst(C);
    }
  }
  return HasFatPointers;
}

static bool hasFatPointerInterface(const Function &F,
                                   BufferFatPtrToStructTypeMap *TypeMap) {
  Type *Ty = F.getFunctionType();
  return Ty != TypeMap->remapType(Ty);
}

/// Move the body of `OldF` into a new function, returning it.
static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
                                          ValueToValueMapTy &CloneMap) {
  bool IsIntrinsic = OldF->isIntrinsic();
  Function *NewF =
      Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
  NewF->copyAttributesFrom(OldF);
  NewF->copyMetadata(OldF, 0);
  NewF->takeName(OldF);
  NewF->updateAfterNameChange();
  NewF->setDLLStorageClass(OldF->getDLLStorageClass());
  OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF);

  while (!OldF->empty()) {
    BasicBlock *BB = &OldF->front();
    BB->removeFromParent();
    BB->insertInto(NewF);
    CloneMap[BB] = BB;
    for (Instruction &I : *BB) {
      CloneMap[&I] = &I;
    }
  }

  AttributeMask PtrOnlyAttrs;
  for (auto K :
       {Attribute::Dereferenceable, Attribute::DereferenceableOrNull,
        Attribute::NoAlias, Attribute::NoCapture, Attribute::NoFree,
        Attribute::NonNull, Attribute::NullPointerIsValid, Attribute::ReadNone,
        Attribute::ReadOnly, Attribute::WriteOnly}) {
    PtrOnlyAttrs.addAttribute(K);
  }
  SmallVector<AttributeSet> ArgAttrs;
  AttributeList OldAttrs = OldF->getAttributes();

  for (auto [I, OldArg, NewArg] : enumerate(OldF->args(), NewF->args())) {
    CloneMap[&NewArg] = &OldArg;
    NewArg.takeName(&OldArg);
    Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType();
    // Temporarily mutate type of `NewArg` to allow RAUW to work.
    NewArg.mutateType(OldArgTy);
    OldArg.replaceAllUsesWith(&NewArg);
    NewArg.mutateType(NewArgTy);

    AttributeSet ArgAttr = OldAttrs.getParamAttrs(I);
    // Intrinsics get their attributes fixed later.
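    // For ordinary functions, pointer-only attributes are dropped from any
    // argument whose type changes, since they aren't meaningful on the struct
    // that replaces a fat pointer. As an illustrative (hypothetical) example,
    // an argument declared as
    //   ptr addrspace(7) noalias nonnull dereferenceable(64) %buf
    // would become a plain
    //   {ptr addrspace(8), i32} %buf
    // argument on the new function.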
    if (OldArgTy != NewArgTy && !IsIntrinsic)
      ArgAttr = ArgAttr.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
    ArgAttrs.push_back(ArgAttr);
  }
  AttributeSet RetAttrs = OldAttrs.getRetAttrs();
  if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic)
    RetAttrs = RetAttrs.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
  NewF->setAttributes(AttributeList::get(
      NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs));
  return NewF;
}

static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) {
  for (Argument &A : F->args())
    CloneMap[&A] = &A;
  for (BasicBlock &BB : *F) {
    CloneMap[&BB] = &BB;
    for (Instruction &I : BB)
      CloneMap[&I] = &I;
  }
}

bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // Record the functions which need to be remapped.
  // The second element of the pair indicates whether the function has to have
  // its arguments or return types adjusted.
  SmallVector<std::pair<Function *, bool>> NeedsRemap;

  BufferFatPtrToStructTypeMap StructTM(DL);
  BufferFatPtrToIntTypeMap IntTM(DL);
  for (const GlobalVariable &GV : M.globals()) {
    if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER)
      report_fatal_error("Global variables with a buffer fat pointer address "
                         "space (7) are not supported");
    Type *VT = GV.getValueType();
    if (VT != StructTM.remapType(VT))
      report_fatal_error("Global variables that contain buffer fat pointers "
                         "(address space 7 pointers) are unsupported. Use "
                         "buffer resource pointers (address space 8) instead.");
  }

  StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
  for (Function &F : M.functions()) {
    bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
    bool BodyChanges = containsBufferFatPointers(F, &StructTM);
    Changed |= MemOpsRewrite.processFunction(F);
    if (InterfaceChange || BodyChanges)
      NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
  }
  if (NeedsRemap.empty())
    return Changed;

  SmallVector<Function *> NeedsPostProcess;
  SmallVector<Function *> Intrinsics;
  // Keep one big map so as to memoize constants across functions.
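  // For example (illustrative), once a constant like `ptr addrspace(7) null`
  // has been materialized as `{ ptr addrspace(8) null, i32 0 }` while mapping
  // one function, the cached mapping is reused when the same constant shows up
  // in another function rather than being rebuilt.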
  ValueToValueMapTy CloneMap;
  FatPtrConstMaterializer Materializer(&StructTM, CloneMap, &IntTM, DL);

  ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
  for (auto [F, InterfaceChange] : NeedsRemap) {
    Function *NewF = F;
    if (InterfaceChange)
      NewF = moveFunctionAdaptingType(
          F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())),
          CloneMap);
    else
      makeCloneInPlaceMap(F, CloneMap);
    LowerInFuncs.remapFunction(*NewF);
    if (NewF->isIntrinsic())
      Intrinsics.push_back(NewF);
    else
      NeedsPostProcess.push_back(NewF);
    if (InterfaceChange) {
      F->replaceAllUsesWith(NewF);
      F->eraseFromParent();
    }
    Changed = true;
  }
  StructTM.clear();
  IntTM.clear();
  CloneMap.clear();

  SplitPtrStructs Splitter(M.getContext(), &TM);
  for (Function *F : NeedsPostProcess)
    Splitter.processFunction(*F);
  for (Function *F : Intrinsics) {
    if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
      F->eraseFromParent();
    } else {
      std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
      if (NewF)
        F->replaceAllUsesWith(*NewF);
    }
  }
  return Changed;
}

bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return run(M, TM);
}

char AMDGPULowerBufferFatPointers::ID = 0;

char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID;

void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
}

#define PASS_DESC "Lower buffer fat pointer operations to buffer resources"
INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false,
                    false)
#undef PASS_DESC

ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() {
  return new AMDGPULowerBufferFatPointers();
}

PreservedAnalyses
AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) {
  return AMDGPULowerBufferFatPointers().run(M, TM) ? PreservedAnalyses::none()
                                                   : PreservedAnalyses::all();
}