1 //===-- AMDGPULowerBufferFatPointers.cpp ---------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass lowers operations on buffer fat pointers (addrspace 7) to 10 // operations on buffer resources (addrspace 8) and is needed for correct 11 // codegen. 12 // 13 // # Background 14 // 15 // Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists 16 // of a 128-bit buffer descriptor and a 32-bit offset into that descriptor. 17 // The buffer resource part needs to be a "raw" buffer resource 18 // (it must have a stride of 0 and bounds checks must be in raw buffer mode 19 // or disabled). 20 // 21 // When these requirements are met, a buffer resource can be treated as a 22 // typical (though quite wide) pointer that follows typical LLVM pointer 23 // semantics. This allows the frontend to reason about such buffers (which are 24 // often encountered in the context of SPIR-V kernels). 25 // 26 // However, because of their non-power-of-2 size, these fat pointers cannot be 27 // present during translation to MIR (though this restriction may be lifted 28 // during the transition to GlobalISel). Therefore, this pass is needed in order 29 // to correctly implement these fat pointers. 30 // 31 // The resource intrinsics take the resource part (the address space 8 pointer) 32 // and the offset part (the 32-bit integer) as separate arguments. In addition, 33 // many users of these buffers manipulate the offset while leaving the resource 34 // part alone. For these reasons, we typically want to separate the resource 35 // and offset parts into separate variables, but combine them together when 36 // encountering cases where this is required, such as by inserting these values 37 // into aggregates or moving them to memory. 38 // 39 // Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8) 40 // %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8), 41 // i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes 42 // `{vector<Nxp8>, vector<Nxi32>}` and its component parts. 43 // 44 // # Implementation 45 // 46 // This pass proceeds in three main phases: 47 // 48 // ## Rewriting loads and stores of p7 49 // 50 // The first phase is to rewrite away all loads and stores of `ptr addrspace(7)`, 51 // including aggregates containing such pointers, to ones that use `i160`. This 52 // is handled by `StoreFatPtrsAsIntsVisitor`, which visits loads, stores, and 53 // allocas and, if the loaded or stored type contains `ptr addrspace(7)`, 54 // rewrites that type to one where the p7s are replaced by i160s, copying other 55 // parts of aggregates as needed. In the case of a store, each pointer is 56 // `ptrtoint`d to i160 before storing, and loaded integers are `inttoptr`d back. 57 // This same transformation is applied to vectors of pointers. 58 // 59 // Such a transformation allows the later phases of the pass to not need 60 // to handle buffer fat pointers moving to and from memory, where we would 61 // have to handle the incompatibility between a `{Nxp8, Nxi32}` representation 62 // and `Nxi160` directly.
Instead, that transposing action (where the vectors 63 // of resources and vectors of offsets are concatenated before being stored to 64 // memory) is handled through implementing `inttoptr` and `ptrtoint` only. 65 // 66 // Atomic operations on `ptr addrspace(7)` values are not supported, as the 67 // hardware does not include a 160-bit atomic. 68 // 69 // ## Type remapping 70 // 71 // We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers 72 // to the corresponding struct type, which has a resource part and an offset 73 // part. 74 // 75 // This remapping is done by a `BufferFatPtrToStructTypeMap` and a 76 // `FatPtrConstMaterializer`, usually by way of `setType`ing values. Constants 77 // are handled here because there isn't a good way to fix them up later. 78 // 79 // This has the downside of leaving the IR in an invalid state (for example, 80 // the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist), 81 // but all such invalid states will be resolved by the third phase. 82 // 83 // Functions that don't take buffer fat pointers are modified in place. Those 84 // that do take such pointers have their basic blocks moved to a new function 85 // whose arguments and return values use {ptr addrspace(8), i32} in place of 86 // the fat pointers. This phase also records intrinsics so that they can be 87 // remangled or deleted later. 88 // 89 // 90 // ## Splitting pointer structs 91 // 92 // The meat of this pass consists of defining semantics for operations that 93 // produce or consume [vectors of] buffer fat pointers in terms of their 94 // resource and offset parts. This is accomplished through the `SplitPtrStructs` 95 // visitor. 96 // 97 // In the first pass through each function that is being lowered, the splitter 98 // inserts new instructions to implement the split-structures behavior, which is 99 // needed for correctness and performance. It records a list of "split users", 100 // instructions that are being replaced by operations on the resource and offset 101 // parts. 102 // 103 // Split users do not necessarily need to produce parts themselves 104 // (a `load float, ptr addrspace(7)` does not, for example), but, if they do not 105 // generate fat buffer pointers, they must RAUW in their replacement 106 // instructions during the initial visit. 107 // 108 // When these new instructions are created, they use the split parts recorded 109 // for their initial arguments in order to generate their replacements, creating 110 // a parallel set of instructions that does not refer to the original fat 111 // pointer values but instead to their resource and offset components. 112 // 113 // Instructions, such as `extractvalue`, that produce buffer fat pointers from 114 // sources that do not have split parts, have such parts generated using 115 // `extractvalue`. This is also the initial handling of PHI nodes, which 116 // are then cleaned up. 117 // 118 // ### Conditionals 119 // 120 // PHI nodes are initially given resource parts via `extractvalue`. However, 121 // this is not an efficient rewrite of such nodes, as, in most cases, the 122 // resource part in a conditional or loop remains constant throughout the loop 123 // and only the offset varies. Failing to optimize away these constant resources 124 // would cause additional registers to be sent around loops and might lead to 125 // waterfall loops being generated for buffer operations due to the 126 // "non-uniform" resource argument.
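//
// As an illustrative sketch (the value names here are hypothetical, not ones
// the pass emits), consider a loop that only advances the offset:
// ```
// loop:
//   %p = phi ptr addrspace(7) [ %base, %entry ], [ %p.next, %loop ]
//   %p.next = getelementptr i32, ptr addrspace(7) %p, i32 1
// ```
// Here the resource part of `%p` is always that of `%base`, so the rewrite
// should ideally produce a single `phi i32` over the offsets and reuse
// `%base`'s resource directly, rather than introducing a second PHI on the
// `ptr addrspace(8)` resource.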
127 // 128 // Therefore, after all instructions have been visited, the pointer splitter 129 // post-processes all encountered conditionals. Given a PHI node or select, 130 // getPossibleRsrcRoots() collects all values that the resource parts of that 131 // conditional's inputs could come from as well as collecting all conditional 132 // instructions encountered during the search. If, after filtering out the 133 // initial node itself, the set of encountered conditionals is a subset of the 134 // potential roots and there is a single potential resource that isn't in the 135 // conditional set, that value is the only possible value the resource argument 136 // could have throughout the control flow. 137 // 138 // If that condition is met, then a PHI node can have its resource part changed 139 // to the singleton value and then be replaced by a PHI on the offsets. 140 // Otherwise, each PHI node is split into two, one for the resource part and one 141 // for the offset part, which replace the temporary `extractvalue` instructions 142 // that were added during the first pass. 143 // 144 // Similar logic applies to `select`, where 145 // `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y` 146 // can be split into `%z.rsrc = %x.rsrc` and 147 // `%z.off = select i1 %cond, i32 %x.off, i32 %y.off` 148 // if both `%x` and `%y` have the same resource part, but two `select` 149 // operations will be needed if they do not. 150 // 151 // ### Final processing 152 // 153 // After conditionals have been cleaned up, the IR for each function is 154 // rewritten to remove all the old instructions that have been split up. 155 // 156 // Any instruction that used to produce a buffer fat pointer (and therefore now 157 // produces a resource-and-offset struct after type remapping) is 158 // replaced as follows: 159 // 1. All debug value annotations are cloned to reflect that the resource part 160 // and offset parts are computed separately and constitute different 161 // fragments of the underlying source language variable. 162 // 2. All uses that were themselves split are replaced by a `poison` of the 163 // struct type, as they will themselves be erased soon. This rule, combined 164 // with debug handling, should leave the use lists of split instructions 165 // empty in almost all cases. 166 // 3. If a user of the original struct-valued result remains, the structure 167 // needed for the new types to work is constructed out of the newly-defined 168 // parts, and the original instruction is replaced by this structure 169 // before being erased. Instructions requiring this construction include 170 // `ret` and `insertvalue`. 171 // 172 // # Consequences 173 // 174 // This pass does not alter the CFG. 175 // 176 // Alias analysis information will become coarser, as the LLVM alias analyzer 177 // cannot handle the buffer intrinsics. Specifically, while we can determine 178 // that the following two loads do not alias: 179 // ``` 180 // %y = getelementptr i32, ptr addrspace(7) %x, i32 1 181 // %a = load i32, ptr addrspace(7) %x 182 // %b = load i32, ptr addrspace(7) %y 183 // ``` 184 // we cannot (except through some code that runs during scheduling) determine 185 // that the rewritten loads below do not alias: 186 // ``` 187 // %y.off = add i32 %x.off, 1 188 // %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32 189 // %x.off, ...) 190 // %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) 191 // %x.rsrc, i32 %y.off, ...)
192 // ``` 193 // However, existing alias information is preserved. 194 //===----------------------------------------------------------------------===// 195 196 #include "AMDGPU.h" 197 #include "AMDGPUTargetMachine.h" 198 #include "GCNSubtarget.h" 199 #include "SIDefines.h" 200 #include "llvm/ADT/SetOperations.h" 201 #include "llvm/ADT/SmallVector.h" 202 #include "llvm/Analysis/ConstantFolding.h" 203 #include "llvm/Analysis/Utils/Local.h" 204 #include "llvm/CodeGen/TargetPassConfig.h" 205 #include "llvm/IR/AttributeMask.h" 206 #include "llvm/IR/Constants.h" 207 #include "llvm/IR/DebugInfo.h" 208 #include "llvm/IR/DerivedTypes.h" 209 #include "llvm/IR/IRBuilder.h" 210 #include "llvm/IR/InstIterator.h" 211 #include "llvm/IR/InstVisitor.h" 212 #include "llvm/IR/Instructions.h" 213 #include "llvm/IR/Intrinsics.h" 214 #include "llvm/IR/IntrinsicsAMDGPU.h" 215 #include "llvm/IR/Metadata.h" 216 #include "llvm/IR/Operator.h" 217 #include "llvm/IR/PatternMatch.h" 218 #include "llvm/IR/ReplaceConstant.h" 219 #include "llvm/InitializePasses.h" 220 #include "llvm/Pass.h" 221 #include "llvm/Support/AtomicOrdering.h" 222 #include "llvm/Support/Debug.h" 223 #include "llvm/Support/ErrorHandling.h" 224 #include "llvm/Transforms/Utils/Cloning.h" 225 #include "llvm/Transforms/Utils/Local.h" 226 #include "llvm/Transforms/Utils/ValueMapper.h" 227 228 #define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers" 229 230 using namespace llvm; 231 232 static constexpr unsigned BufferOffsetWidth = 32; 233 234 namespace { 235 /// Recursively replace instances of ptr addrspace(7) and vector<Nxptr 236 /// addrspace(7)> with some other type as defined by the relevant subclass. 237 class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper { 238 DenseMap<Type *, Type *> Map; 239 240 Type *remapTypeImpl(Type *Ty, SmallPtrSetImpl<StructType *> &Seen); 241 242 protected: 243 virtual Type *remapScalar(PointerType *PT) = 0; 244 virtual Type *remapVector(VectorType *VT) = 0; 245 246 const DataLayout &DL; 247 248 public: 249 BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {} 250 Type *remapType(Type *SrcTy) override; 251 void clear() { Map.clear(); } 252 }; 253 254 /// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to 255 /// vector<Nxi160> in order to correctly handle loading/storing these values 256 /// from memory. 257 class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase { 258 using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase; 259 260 protected: 261 Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); } 262 Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); } 263 }; 264 265 /// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset 266 /// parts of the pointer) so that we can easily rewrite operations on these 267 /// values that aren't loading them from or storing them to memory.
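///
/// As an illustrative sketch of the mapping implemented by remapScalar() and
/// remapVector() below, `ptr addrspace(7)` becomes `{ptr addrspace(8), i32}`
/// and `<4 x ptr addrspace(7)>` becomes `{<4 x ptr addrspace(8)>, <4 x i32>}`.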
268 class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase { 269 using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase; 270 271 protected: 272 Type *remapScalar(PointerType *PT) override; 273 Type *remapVector(VectorType *VT) override; 274 }; 275 } // namespace 276 277 // This code is adapted from the type remapper in lib/Linker/IRMover.cpp 278 Type *BufferFatPtrTypeLoweringBase::remapTypeImpl( 279 Type *Ty, SmallPtrSetImpl<StructType *> &Seen) { 280 Type **Entry = &Map[Ty]; 281 if (*Entry) 282 return *Entry; 283 if (auto *PT = dyn_cast<PointerType>(Ty)) { 284 if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { 285 return *Entry = remapScalar(PT); 286 } 287 } 288 if (auto *VT = dyn_cast<VectorType>(Ty)) { 289 auto *PT = dyn_cast<PointerType>(VT->getElementType()); 290 if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { 291 return *Entry = remapVector(VT); 292 } 293 return *Entry = Ty; 294 } 295 // Whether the type is one that is structurally uniqued - that is, if it is 296 // not a named struct (the only kind of type for which multiple structurally 297 // identical types can have distinct `Type*`s). 298 StructType *TyAsStruct = dyn_cast<StructType>(Ty); 299 bool IsUniqued = !TyAsStruct || TyAsStruct->isLiteral(); 300 // Base case for ints, floats, opaque pointers, and so on, which don't 301 // require recursion. 302 if (Ty->getNumContainedTypes() == 0 && IsUniqued) 303 return *Entry = Ty; 304 if (!IsUniqued) { 305 // Create a dummy type for recursion purposes. 306 if (!Seen.insert(TyAsStruct).second) { 307 StructType *Placeholder = StructType::create(Ty->getContext()); 308 return *Entry = Placeholder; 309 } 310 } 311 bool Changed = false; 312 SmallVector<Type *> ElementTypes(Ty->getNumContainedTypes(), nullptr); 313 for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) { 314 Type *OldElem = Ty->getContainedType(I); 315 Type *NewElem = remapTypeImpl(OldElem, Seen); 316 ElementTypes[I] = NewElem; 317 Changed |= (OldElem != NewElem); 318 } 319 // Recursive calls to remapTypeImpl() may have invalidated the `Entry` pointer. 320 Entry = &Map[Ty]; 321 if (!Changed) { 322 return *Entry = Ty; 323 } 324 if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) 325 return *Entry = ArrayType::get(ElementTypes[0], ArrTy->getNumElements()); 326 if (auto *FnTy = dyn_cast<FunctionType>(Ty)) 327 return *Entry = FunctionType::get(ElementTypes[0], 328 ArrayRef(ElementTypes).slice(1), 329 FnTy->isVarArg()); 330 if (auto *STy = dyn_cast<StructType>(Ty)) { 331 // Genuine opaque types don't have a remapping.
332 if (STy->isOpaque()) 333 return *Entry = Ty; 334 bool IsPacked = STy->isPacked(); 335 if (IsUniqued) 336 return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked); 337 SmallString<16> Name(STy->getName()); 338 STy->setName(""); 339 Type **RecursionEntry = &Map[Ty]; 340 if (*RecursionEntry) { 341 auto *Placeholder = cast<StructType>(*RecursionEntry); 342 Placeholder->setBody(ElementTypes, IsPacked); 343 Placeholder->setName(Name); 344 return *Entry = Placeholder; 345 } 346 return *Entry = StructType::create(Ty->getContext(), ElementTypes, Name, 347 IsPacked); 348 } 349 llvm_unreachable("Unknown type of type that contains elements"); 350 } 351 352 Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) { 353 SmallPtrSet<StructType *, 2> Visited; 354 return remapTypeImpl(SrcTy, Visited); 355 } 356 357 Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) { 358 LLVMContext &Ctx = PT->getContext(); 359 return StructType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), 360 IntegerType::get(Ctx, BufferOffsetWidth)); 361 } 362 363 Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) { 364 ElementCount EC = VT->getElementCount(); 365 LLVMContext &Ctx = VT->getContext(); 366 Type *RsrcVec = 367 VectorType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), EC); 368 Type *OffVec = VectorType::get(IntegerType::get(Ctx, BufferOffsetWidth), EC); 369 return StructType::get(RsrcVec, OffVec); 370 } 371 372 static bool isBufferFatPtrOrVector(Type *Ty) { 373 if (auto *PT = dyn_cast<PointerType>(Ty->getScalarType())) 374 return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER; 375 return false; 376 } 377 378 // True if the type is {ptr addrspace(8), i32} or a struct containing vectors of 379 // those types. Used to quickly skip instructions we don't need to process. 380 static bool isSplitFatPtr(Type *Ty) { 381 auto *ST = dyn_cast<StructType>(Ty); 382 if (!ST) 383 return false; 384 if (!ST->isLiteral() || ST->getNumElements() != 2) 385 return false; 386 auto *MaybeRsrc = 387 dyn_cast<PointerType>(ST->getElementType(0)->getScalarType()); 388 auto *MaybeOff = 389 dyn_cast<IntegerType>(ST->getElementType(1)->getScalarType()); 390 return MaybeRsrc && MaybeOff && 391 MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE && 392 MaybeOff->getBitWidth() == BufferOffsetWidth; 393 } 394 395 // True if the result type or any argument types are buffer fat pointers. 396 static bool isBufferFatPtrConst(Constant *C) { 397 Type *T = C->getType(); 398 return isBufferFatPtrOrVector(T) || any_of(C->operands(), [](const Use &U) { 399 return isBufferFatPtrOrVector(U.get()->getType()); 400 }); 401 } 402 403 namespace { 404 /// Convert [vectors of] buffer fat pointers to integers when they are read from 405 /// or stored to memory. This ensures that these pointers will have the same 406 /// memory layout as before they are lowered, even though they will no longer 407 /// have their previous layout in registers/in the program (they'll be broken 408 /// down into resource and offset parts). This has the downside of imposing 409 /// marshalling costs when reading or storing these values, but since placing 410 /// such pointers into memory is an uncommon operation at best, we feel that 411 /// this cost is acceptable for better performance in the common case. 
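///
/// As an illustrative sketch (the value names are hypothetical), a store such
/// as `store ptr addrspace(7) %p, ptr %slot` becomes roughly
///   %p.int = ptrtoint ptr addrspace(7) %p to i160
///   store i160 %p.int, ptr %slot
/// and a load of `ptr addrspace(7)` becomes an `i160` load followed by an
/// `inttoptr` back to the fat pointer type.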
412 class StoreFatPtrsAsIntsVisitor 413 : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> { 414 BufferFatPtrToIntTypeMap *TypeMap; 415 416 ValueToValueMapTy ConvertedForStore; 417 418 IRBuilder<> IRB; 419 420 // Convert all the buffer fat pointers within the input value to integers 421 // so that it can be stored in memory. 422 Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name); 423 // Convert all the i160s that need to be buffer fat pointers (as specified 424 // by the To type) into those pointers to preserve the semantics of the rest 425 // of the program. 426 Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name); 427 428 public: 429 StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx) 430 : TypeMap(TypeMap), IRB(Ctx) {} 431 bool processFunction(Function &F); 432 433 bool visitInstruction(Instruction &I) { return false; } 434 bool visitAllocaInst(AllocaInst &I); 435 bool visitLoadInst(LoadInst &LI); 436 bool visitStoreInst(StoreInst &SI); 437 bool visitGetElementPtrInst(GetElementPtrInst &I); 438 }; 439 } // namespace 440 441 Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To, 442 const Twine &Name) { 443 if (From == To) 444 return V; 445 ValueToValueMapTy::iterator Find = ConvertedForStore.find(V); 446 if (Find != ConvertedForStore.end()) 447 return Find->second; 448 if (isBufferFatPtrOrVector(From)) { 449 Value *Cast = IRB.CreatePtrToInt(V, To, Name + ".int"); 450 ConvertedForStore[V] = Cast; 451 return Cast; 452 } 453 if (From->getNumContainedTypes() == 0) 454 return V; 455 // Structs, arrays, and other compound types. 456 Value *Ret = PoisonValue::get(To); 457 if (auto *AT = dyn_cast<ArrayType>(From)) { 458 Type *FromPart = AT->getArrayElementType(); 459 Type *ToPart = cast<ArrayType>(To)->getElementType(); 460 for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) { 461 Value *Field = IRB.CreateExtractValue(V, I); 462 Value *NewField = 463 fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(I)); 464 Ret = IRB.CreateInsertValue(Ret, NewField, I); 465 } 466 } else { 467 for (auto [Idx, FromPart, ToPart] : 468 enumerate(From->subtypes(), To->subtypes())) { 469 Value *Field = IRB.CreateExtractValue(V, Idx); 470 Value *NewField = 471 fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(Idx)); 472 Ret = IRB.CreateInsertValue(Ret, NewField, Idx); 473 } 474 } 475 ConvertedForStore[V] = Ret; 476 return Ret; 477 } 478 479 Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To, 480 const Twine &Name) { 481 if (From == To) 482 return V; 483 if (isBufferFatPtrOrVector(To)) { 484 Value *Cast = IRB.CreateIntToPtr(V, To, Name + ".ptr"); 485 return Cast; 486 } 487 if (From->getNumContainedTypes() == 0) 488 return V; 489 // Structs, arrays, and other compound types. 490 Value *Ret = PoisonValue::get(To); 491 if (auto *AT = dyn_cast<ArrayType>(From)) { 492 Type *FromPart = AT->getArrayElementType(); 493 Type *ToPart = cast<ArrayType>(To)->getElementType(); 494 for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) { 495 Value *Field = IRB.CreateExtractValue(V, I); 496 Value *NewField = 497 intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(I)); 498 Ret = IRB.CreateInsertValue(Ret, NewField, I); 499 } 500 } else { 501 for (auto [Idx, FromPart, ToPart] : 502 enumerate(From->subtypes(), To->subtypes())) { 503 Value *Field = IRB.CreateExtractValue(V, Idx); 504 Value *NewField = 505 intsToFatPtrs(Field, FromPart, ToPart, Name + "."
+ Twine(Idx)); 506 Ret = IRB.CreateInsertValue(Ret, NewField, Idx); 507 } 508 } 509 return Ret; 510 } 511 512 bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) { 513 bool Changed = false; 514 // The visitors will mutate GEPs and allocas, but will push loads and stores 515 // to the worklist to avoid invalidation. 516 for (Instruction &I : make_early_inc_range(instructions(F))) { 517 Changed |= visit(I); 518 } 519 ConvertedForStore.clear(); 520 return Changed; 521 } 522 523 bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) { 524 Type *Ty = I.getAllocatedType(); 525 Type *NewTy = TypeMap->remapType(Ty); 526 if (Ty == NewTy) 527 return false; 528 I.setAllocatedType(NewTy); 529 return true; 530 } 531 532 bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { 533 Type *Ty = I.getSourceElementType(); 534 Type *NewTy = TypeMap->remapType(Ty); 535 if (Ty == NewTy) 536 return false; 537 // We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so 538 // make sure GEPs don't have different semantics with the new type. 539 I.setSourceElementType(NewTy); 540 I.setResultElementType(TypeMap->remapType(I.getResultElementType())); 541 return true; 542 } 543 544 bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) { 545 Type *Ty = LI.getType(); 546 Type *IntTy = TypeMap->remapType(Ty); 547 if (Ty == IntTy) 548 return false; 549 550 IRB.SetInsertPoint(&LI); 551 auto *NLI = cast<LoadInst>(LI.clone()); 552 NLI->mutateType(IntTy); 553 NLI = IRB.Insert(NLI); 554 copyMetadataForLoad(*NLI, LI); 555 NLI->takeName(&LI); 556 557 Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName()); 558 LI.replaceAllUsesWith(CastBack); 559 LI.eraseFromParent(); 560 return true; 561 } 562 563 bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { 564 Value *V = SI.getValueOperand(); 565 Type *Ty = V->getType(); 566 Type *IntTy = TypeMap->remapType(Ty); 567 if (Ty == IntTy) 568 return false; 569 570 IRB.SetInsertPoint(&SI); 571 Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName()); 572 for (auto *Dbg : at::getAssignmentMarkers(&SI)) 573 Dbg->setValue(IntV); 574 575 SI.setOperand(0, IntV); 576 return true; 577 } 578 579 /// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered 580 /// buffer fat pointer constant. 581 static std::pair<Constant *, Constant *> 582 splitLoweredFatBufferConst(Constant *C) { 583 assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer"); 584 return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u)); 585 } 586 587 namespace { 588 /// Handle the remapping of ptr addrspace(7) constants. 589 class FatPtrConstMaterializer final : public ValueMaterializer { 590 BufferFatPtrToStructTypeMap *TypeMap; 591 // An internal mapper that is used to recurse into the arguments of constants. 592 // While the documentation for `ValueMapper` specifies not to use it 593 // recursively, examination of the logic in mapValue() shows that it can 594 // safely be used recursively when handling constants, like it does in its own 595 // logic. 596 ValueMapper InternalMapper; 597 598 Constant *materializeBufferFatPtrConst(Constant *C); 599 600 public: 601 // UnderlyingMap is the value map this materializer will be filling. 
602 FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap, 603 ValueToValueMapTy &UnderlyingMap) 604 : TypeMap(TypeMap), 605 InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {} 606 virtual ~FatPtrConstMaterializer() = default; 607 608 Value *materialize(Value *V) override; 609 }; 610 } // namespace 611 612 Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { 613 Type *SrcTy = C->getType(); 614 auto *NewTy = dyn_cast<StructType>(TypeMap->remapType(SrcTy)); 615 if (C->isNullValue()) 616 return ConstantAggregateZero::getNullValue(NewTy); 617 if (isa<PoisonValue>(C)) { 618 return ConstantStruct::get(NewTy, 619 {PoisonValue::get(NewTy->getElementType(0)), 620 PoisonValue::get(NewTy->getElementType(1))}); 621 } 622 if (isa<UndefValue>(C)) { 623 return ConstantStruct::get(NewTy, 624 {UndefValue::get(NewTy->getElementType(0)), 625 UndefValue::get(NewTy->getElementType(1))}); 626 } 627 628 if (auto *VC = dyn_cast<ConstantVector>(C)) { 629 if (Constant *S = VC->getSplatValue()) { 630 Constant *NewS = InternalMapper.mapConstant(*S); 631 if (!NewS) 632 return nullptr; 633 auto [Rsrc, Off] = splitLoweredFatBufferConst(NewS); 634 auto EC = VC->getType()->getElementCount(); 635 return ConstantStruct::get(NewTy, {ConstantVector::getSplat(EC, Rsrc), 636 ConstantVector::getSplat(EC, Off)}); 637 } 638 SmallVector<Constant *> Rsrcs; 639 SmallVector<Constant *> Offs; 640 for (Value *Op : VC->operand_values()) { 641 auto *NewOp = dyn_cast_or_null<Constant>(InternalMapper.mapValue(*Op)); 642 if (!NewOp) 643 return nullptr; 644 auto [Rsrc, Off] = splitLoweredFatBufferConst(NewOp); 645 Rsrcs.push_back(Rsrc); 646 Offs.push_back(Off); 647 } 648 Constant *RsrcVec = ConstantVector::get(Rsrcs); 649 Constant *OffVec = ConstantVector::get(Offs); 650 return ConstantStruct::get(NewTy, {RsrcVec, OffVec}); 651 } 652 653 if (isa<GlobalValue>(C)) 654 report_fatal_error("Global values containing ptr addrspace(7) (buffer " 655 "fat pointer) values are not supported"); 656 657 if (isa<ConstantExpr>(C)) 658 report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer " 659 "fat pointer) values should have been expanded earlier"); 660 661 return nullptr; 662 } 663 664 Value *FatPtrConstMaterializer::materialize(Value *V) { 665 Constant *C = dyn_cast<Constant>(V); 666 if (!C) 667 return nullptr; 668 // Structs and other types that happen to contain fat pointers get remapped 669 // by the mapValue() logic. 670 if (!isBufferFatPtrConst(C)) 671 return nullptr; 672 return materializeBufferFatPtrConst(C); 673 } 674 675 using PtrParts = std::pair<Value *, Value *>; 676 namespace { 677 // The visitor returns the resource and offset parts for an instruction if they 678 // can be computed, or (nullptr, nullptr) for cases that don't have a meaningful 679 // value mapping. 680 class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> { 681 ValueToValueMapTy RsrcParts; 682 ValueToValueMapTy OffParts; 683 684 // Track instructions that have been rewritten into a user of the component 685 // parts of their ptr addrspace(7) input. Instructions that produced 686 // ptr addrspace(7) parts should **not** be RAUW'd before being added to this 687 // set, as that replacement will be handled in a post-visit step. However, 688 // instructions that yield values that aren't fat pointers (ex. ptrtoint) 689 // should RAUW themselves with new instructions that use the split parts 690 // of their arguments during processing. 
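  // As an illustrative contrast: visitPtrToIntInst() RAUWs the original
  // `ptrtoint` with integer math on the parts right away, while
  // visitGetElementPtrInst() only returns the new resource/offset parts and
  // leaves the replacement of the GEP itself to the post-visit step.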
691 DenseSet<Instruction *> SplitUsers; 692 693 // Nodes that need a second look once we've computed the parts for all other 694 // instructions to see if, for example, we really need to phi on the resource 695 // part. 696 SmallVector<Instruction *> Conditionals; 697 // Temporary instructions produced while lowering conditionals that should be 698 // killed. 699 SmallVector<Instruction *> ConditionalTemps; 700 701 // Subtarget info, needed for determining what cache control bits to set. 702 const TargetMachine *TM; 703 const GCNSubtarget *ST; 704 705 IRBuilder<> IRB; 706 707 // Copy metadata between instructions if applicable. 708 void copyMetadata(Value *Dest, Value *Src); 709 710 // Get the resource and offset parts of the value V, inserting appropriate 711 // extractvalue calls if needed. 712 PtrParts getPtrParts(Value *V); 713 714 // Given an instruction that could produce multiple resource parts (a PHI or 715 // select), collect the set of possible values its resource part could have 716 // come from (the `Roots`) and the set of 717 // conditional instructions visited during the search (`Seen`). If, after 718 // removing the root of the search from `Seen` and `Roots`, `Seen` is a subset 719 // of `Roots` and `Roots - Seen` contains one element, the resource part of 720 // that element can replace the resource part of all other elements in `Seen`. 721 void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl<Value *> &Roots, 722 SmallPtrSetImpl<Value *> &Seen); 723 void processConditionals(); 724 725 // If an instruction has been split into resource and offset parts, 726 // delete that instruction. If any of its uses have not themselves been split 727 // into parts (for example, an insertvalue), construct the structure 728 // that the type rewrites declare the dying instruction should produce 729 // and use that. 730 // Also, kill the temporary extractvalue operations produced by the two-stage 731 // lowering of PHIs and conditionals.
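  // As an illustrative example, if a dying instruction still has a user such
  // as a `ret` or an `insertvalue` of the {ptr addrspace(8), i32} struct, the
  // replacement struct is rebuilt from the split parts with `insertvalue`
  // before the original instruction is erased.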
732 void killAndReplaceSplitInstructions(SmallVectorImpl<Instruction *> &Origs); 733 734 void setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx); 735 void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); 736 void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); 737 Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty, 738 Align Alignment, AtomicOrdering Order, 739 bool IsVolatile, SyncScope::ID SSID); 740 741 public: 742 SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM) 743 : TM(TM), ST(nullptr), IRB(Ctx) {} 744 745 void processFunction(Function &F); 746 747 PtrParts visitInstruction(Instruction &I); 748 PtrParts visitLoadInst(LoadInst &LI); 749 PtrParts visitStoreInst(StoreInst &SI); 750 PtrParts visitAtomicRMWInst(AtomicRMWInst &AI); 751 PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI); 752 PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP); 753 754 PtrParts visitPtrToIntInst(PtrToIntInst &PI); 755 PtrParts visitIntToPtrInst(IntToPtrInst &IP); 756 PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I); 757 PtrParts visitICmpInst(ICmpInst &Cmp); 758 PtrParts visitFreezeInst(FreezeInst &I); 759 760 PtrParts visitExtractElementInst(ExtractElementInst &I); 761 PtrParts visitInsertElementInst(InsertElementInst &I); 762 PtrParts visitShuffleVectorInst(ShuffleVectorInst &I); 763 764 PtrParts visitPHINode(PHINode &PHI); 765 PtrParts visitSelectInst(SelectInst &SI); 766 767 PtrParts visitIntrinsicInst(IntrinsicInst &II); 768 }; 769 } // namespace 770 771 void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) { 772 auto *DestI = dyn_cast<Instruction>(Dest); 773 auto *SrcI = dyn_cast<Instruction>(Src); 774 775 if (!DestI || !SrcI) 776 return; 777 778 DestI->copyMetadata(*SrcI); 779 } 780 781 PtrParts SplitPtrStructs::getPtrParts(Value *V) { 782 assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts " 783 "of something that wasn't rewritten"); 784 auto *RsrcEntry = &RsrcParts[V]; 785 auto *OffEntry = &OffParts[V]; 786 if (*RsrcEntry && *OffEntry) 787 return {*RsrcEntry, *OffEntry}; 788 789 if (auto *C = dyn_cast<Constant>(V)) { 790 auto [Rsrc, Off] = splitLoweredFatBufferConst(C); 791 return {*RsrcEntry = Rsrc, *OffEntry = Off}; 792 } 793 794 IRBuilder<>::InsertPointGuard Guard(IRB); 795 if (auto *I = dyn_cast<Instruction>(V)) { 796 LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n"); 797 auto [Rsrc, Off] = visit(*I); 798 if (Rsrc && Off) 799 return {*RsrcEntry = Rsrc, *OffEntry = Off}; 800 // We'll be creating the new values after the relevant instruction. 801 // This instruction generates a value and so isn't a terminator. 802 IRB.SetInsertPoint(*I->getInsertionPointAfterDef()); 803 IRB.SetCurrentDebugLocation(I->getDebugLoc()); 804 } else if (auto *A = dyn_cast<Argument>(V)) { 805 IRB.SetInsertPointPastAllocas(A->getParent()); 806 IRB.SetCurrentDebugLocation(DebugLoc()); 807 } 808 Value *Rsrc = IRB.CreateExtractValue(V, 0, V->getName() + ".rsrc"); 809 Value *Off = IRB.CreateExtractValue(V, 1, V->getName() + ".off"); 810 return {*RsrcEntry = Rsrc, *OffEntry = Off}; 811 } 812 813 /// Returns the instruction that defines the resource part of the value V. 814 /// Note that this is not getUnderlyingObject(), since that looks through 815 /// operations like ptrmask which might modify the resource part. 
816 /// 817 /// We can limit ourselves to just looking through GEPs followed by looking 818 /// through addrspacecasts because only those two operations preserve the 819 /// resource part, and because operations on an `addrspace(8)` (which is the 820 /// legal input to this addrspacecast) would produce a different resource part. 821 static Value *rsrcPartRoot(Value *V) { 822 while (auto *GEP = dyn_cast<GEPOperator>(V)) 823 V = GEP->getPointerOperand(); 824 while (auto *ASC = dyn_cast<AddrSpaceCastOperator>(V)) 825 V = ASC->getPointerOperand(); 826 return V; 827 } 828 829 void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I, 830 SmallPtrSetImpl<Value *> &Roots, 831 SmallPtrSetImpl<Value *> &Seen) { 832 if (auto *PHI = dyn_cast<PHINode>(I)) { 833 if (!Seen.insert(I).second) 834 return; 835 for (Value *In : PHI->incoming_values()) { 836 In = rsrcPartRoot(In); 837 Roots.insert(In); 838 if (isa<PHINode, SelectInst>(In)) 839 getPossibleRsrcRoots(cast<Instruction>(In), Roots, Seen); 840 } 841 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 842 if (!Seen.insert(SI).second) 843 return; 844 Value *TrueVal = rsrcPartRoot(SI->getTrueValue()); 845 Value *FalseVal = rsrcPartRoot(SI->getFalseValue()); 846 Roots.insert(TrueVal); 847 Roots.insert(FalseVal); 848 if (isa<PHINode, SelectInst>(TrueVal)) 849 getPossibleRsrcRoots(cast<Instruction>(TrueVal), Roots, Seen); 850 if (isa<PHINode, SelectInst>(FalseVal)) 851 getPossibleRsrcRoots(cast<Instruction>(FalseVal), Roots, Seen); 852 } else { 853 llvm_unreachable("getPossibleRsrcParts() only works on phi and select"); 854 } 855 } 856 857 void SplitPtrStructs::processConditionals() { 858 SmallDenseMap<Instruction *, Value *> FoundRsrcs; 859 SmallPtrSet<Value *, 4> Roots; 860 SmallPtrSet<Value *, 4> Seen; 861 for (Instruction *I : Conditionals) { 862 // These have to exist by now because we've visited these nodes. 863 Value *Rsrc = RsrcParts[I]; 864 Value *Off = OffParts[I]; 865 assert(Rsrc && Off && "must have visited conditionals by now"); 866 867 std::optional<Value *> MaybeRsrc; 868 auto MaybeFoundRsrc = FoundRsrcs.find(I); 869 if (MaybeFoundRsrc != FoundRsrcs.end()) { 870 MaybeRsrc = MaybeFoundRsrc->second; 871 } else { 872 IRBuilder<>::InsertPointGuard Guard(IRB); 873 Roots.clear(); 874 Seen.clear(); 875 getPossibleRsrcRoots(I, Roots, Seen); 876 LLVM_DEBUG(dbgs() << "Processing conditional: " << *I << "\n"); 877 #ifndef NDEBUG 878 for (Value *V : Roots) 879 LLVM_DEBUG(dbgs() << "Root: " << *V << "\n"); 880 for (Value *V : Seen) 881 LLVM_DEBUG(dbgs() << "Seen: " << *V << "\n"); 882 #endif 883 // If we are our own possible root, then we shouldn't block our 884 // replacement with a valid incoming value. 885 Roots.erase(I); 886 // We don't want to block the optimization for conditionals that don't 887 // refer to themselves but did see themselves during the traversal. 888 Seen.erase(I); 889 890 if (set_is_subset(Seen, Roots)) { 891 auto Diff = set_difference(Roots, Seen); 892 if (Diff.size() == 1) { 893 Value *RootVal = *Diff.begin(); 894 // Handle the case where previous loops already looked through 895 // an addrspacecast. 
896 if (isSplitFatPtr(RootVal->getType())) 897 MaybeRsrc = std::get<0>(getPtrParts(RootVal)); 898 else 899 MaybeRsrc = RootVal; 900 } 901 } 902 } 903 904 if (auto *PHI = dyn_cast<PHINode>(I)) { 905 Value *NewRsrc; 906 StructType *PHITy = cast<StructType>(PHI->getType()); 907 IRB.SetInsertPoint(*PHI->getInsertionPointAfterDef()); 908 IRB.SetCurrentDebugLocation(PHI->getDebugLoc()); 909 if (MaybeRsrc) { 910 NewRsrc = *MaybeRsrc; 911 } else { 912 Type *RsrcTy = PHITy->getElementType(0); 913 auto *RsrcPHI = IRB.CreatePHI(RsrcTy, PHI->getNumIncomingValues()); 914 RsrcPHI->takeName(Rsrc); 915 for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { 916 Value *VRsrc = std::get<0>(getPtrParts(V)); 917 RsrcPHI->addIncoming(VRsrc, BB); 918 } 919 copyMetadata(RsrcPHI, PHI); 920 NewRsrc = RsrcPHI; 921 } 922 923 Type *OffTy = PHITy->getElementType(1); 924 auto *NewOff = IRB.CreatePHI(OffTy, PHI->getNumIncomingValues()); 925 NewOff->takeName(Off); 926 for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { 927 assert(OffParts.count(V) && "An offset part had to be created by now"); 928 Value *VOff = std::get<1>(getPtrParts(V)); 929 NewOff->addIncoming(VOff, BB); 930 } 931 copyMetadata(NewOff, PHI); 932 933 // Note: We don't eraseFromParent() the temporaries because we don't want 934 // to put the correspondence maps in an inconsistent state. That'll be handled 935 // during the rest of the killing. Also, `ValueToValueMapTy` guarantees 936 // that references in that map will be updated as well. 937 ConditionalTemps.push_back(cast<Instruction>(Rsrc)); 938 ConditionalTemps.push_back(cast<Instruction>(Off)); 939 Rsrc->replaceAllUsesWith(NewRsrc); 940 Off->replaceAllUsesWith(NewOff); 941 942 // Save on recomputing the cycle traversals in known-root cases. 943 if (MaybeRsrc) 944 for (Value *V : Seen) 945 FoundRsrcs[cast<Instruction>(V)] = NewRsrc; 946 } else if (isa<SelectInst>(I)) { 947 if (MaybeRsrc) { 948 ConditionalTemps.push_back(cast<Instruction>(Rsrc)); 949 Rsrc->replaceAllUsesWith(*MaybeRsrc); 950 for (Value *V : Seen) 951 FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc; 952 } 953 } else { 954 llvm_unreachable("Only PHIs and selects go in the conditionals list"); 955 } 956 } 957 } 958 959 void SplitPtrStructs::killAndReplaceSplitInstructions( 960 SmallVectorImpl<Instruction *> &Origs) { 961 for (Instruction *I : ConditionalTemps) 962 I->eraseFromParent(); 963 964 for (Instruction *I : Origs) { 965 if (!SplitUsers.contains(I)) 966 continue; 967 968 SmallVector<DbgValueInst *> Dbgs; 969 findDbgValues(Dbgs, I); 970 for (auto *Dbg : Dbgs) { 971 IRB.SetInsertPoint(Dbg); 972 auto &DL = I->getModule()->getDataLayout(); 973 assert(isSplitFatPtr(I->getType()) && 974 "We should've RAUW'd away loads, stores, etc.
at this point"); 975 auto *OffDbg = cast<DbgValueInst>(Dbg->clone()); 976 copyMetadata(OffDbg, Dbg); 977 auto [Rsrc, Off] = getPtrParts(I); 978 979 int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType()); 980 int64_t OffSz = DL.getTypeSizeInBits(Off->getType()); 981 982 std::optional<DIExpression *> RsrcExpr = 983 DIExpression::createFragmentExpression(Dbg->getExpression(), 0, 984 RsrcSz); 985 std::optional<DIExpression *> OffExpr = 986 DIExpression::createFragmentExpression(Dbg->getExpression(), RsrcSz, 987 OffSz); 988 if (OffExpr) { 989 OffDbg->setExpression(*OffExpr); 990 OffDbg->replaceVariableLocationOp(I, Off); 991 IRB.Insert(OffDbg); 992 } else { 993 OffDbg->deleteValue(); 994 } 995 if (RsrcExpr) { 996 Dbg->setExpression(*RsrcExpr); 997 Dbg->replaceVariableLocationOp(I, Rsrc); 998 } else { 999 Dbg->replaceVariableLocationOp(I, UndefValue::get(I->getType())); 1000 } 1001 } 1002 1003 Value *Poison = PoisonValue::get(I->getType()); 1004 I->replaceUsesWithIf(Poison, [&](const Use &U) -> bool { 1005 if (const auto *UI = dyn_cast<Instruction>(U.getUser())) 1006 return SplitUsers.contains(UI); 1007 return false; 1008 }); 1009 1010 if (I->use_empty()) { 1011 I->eraseFromParent(); 1012 continue; 1013 } 1014 IRB.SetInsertPoint(*I->getInsertionPointAfterDef()); 1015 IRB.SetCurrentDebugLocation(I->getDebugLoc()); 1016 auto [Rsrc, Off] = getPtrParts(I); 1017 Value *Struct = PoisonValue::get(I->getType()); 1018 Struct = IRB.CreateInsertValue(Struct, Rsrc, 0); 1019 Struct = IRB.CreateInsertValue(Struct, Off, 1); 1020 copyMetadata(Struct, I); 1021 Struct->takeName(I); 1022 I->replaceAllUsesWith(Struct); 1023 I->eraseFromParent(); 1024 } 1025 } 1026 1027 void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) { 1028 LLVMContext &Ctx = Intr->getContext(); 1029 Intr->addParamAttr(RsrcArgIdx, Attribute::getWithAlignment(Ctx, A)); 1030 } 1031 1032 void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order, 1033 SyncScope::ID SSID) { 1034 switch (Order) { 1035 case AtomicOrdering::Release: 1036 case AtomicOrdering::AcquireRelease: 1037 case AtomicOrdering::SequentiallyConsistent: 1038 IRB.CreateFence(AtomicOrdering::Release, SSID); 1039 break; 1040 default: 1041 break; 1042 } 1043 } 1044 1045 void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order, 1046 SyncScope::ID SSID) { 1047 switch (Order) { 1048 case AtomicOrdering::Acquire: 1049 case AtomicOrdering::AcquireRelease: 1050 case AtomicOrdering::SequentiallyConsistent: 1051 IRB.CreateFence(AtomicOrdering::Acquire, SSID); 1052 break; 1053 default: 1054 break; 1055 } 1056 } 1057 1058 Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, 1059 Type *Ty, Align Alignment, 1060 AtomicOrdering Order, bool IsVolatile, 1061 SyncScope::ID SSID) { 1062 IRB.SetInsertPoint(I); 1063 1064 auto [Rsrc, Off] = getPtrParts(Ptr); 1065 SmallVector<Value *, 5> Args; 1066 if (Arg) 1067 Args.push_back(Arg); 1068 Args.push_back(Rsrc); 1069 Args.push_back(Off); 1070 insertPreMemOpFence(Order, SSID); 1071 // soffset is always 0 for these cases, where we always want any offset to be 1072 // part of bounds checking and we don't know which parts of the GEPs is 1073 // uniform. 1074 Args.push_back(IRB.getInt32(0)); 1075 1076 uint32_t Aux = 0; 1077 bool IsInvariant = 1078 (isa<LoadInst>(I) && I->getMetadata(LLVMContext::MD_invariant_load)); 1079 bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal); 1080 // Atomic loads and stores need glc, atomic read-modify-write doesn't. 
1081 bool IsOneWayAtomic = 1082 !isa<AtomicRMWInst>(I) && Order != AtomicOrdering::NotAtomic; 1083 if (IsOneWayAtomic) 1084 Aux |= AMDGPU::CPol::GLC; 1085 if (IsNonTemporal && !IsInvariant) 1086 Aux |= AMDGPU::CPol::SLC; 1087 if (isa<LoadInst>(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10) 1088 Aux |= (Aux & AMDGPU::CPol::GLC ? AMDGPU::CPol::DLC : 0); 1089 if (IsVolatile) 1090 Aux |= AMDGPU::CPol::VOLATILE; 1091 Args.push_back(IRB.getInt32(Aux)); 1092 1093 Intrinsic::ID IID = Intrinsic::not_intrinsic; 1094 if (isa<LoadInst>(I)) 1095 // TODO: Do we need to do something about atomic loads? 1096 IID = Intrinsic::amdgcn_raw_ptr_buffer_load; 1097 else if (isa<StoreInst>(I)) 1098 IID = Intrinsic::amdgcn_raw_ptr_buffer_store; 1099 else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) { 1100 switch (RMW->getOperation()) { 1101 case AtomicRMWInst::Xchg: 1102 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap; 1103 break; 1104 case AtomicRMWInst::Add: 1105 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add; 1106 break; 1107 case AtomicRMWInst::Sub: 1108 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub; 1109 break; 1110 case AtomicRMWInst::And: 1111 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and; 1112 break; 1113 case AtomicRMWInst::Or: 1114 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or; 1115 break; 1116 case AtomicRMWInst::Xor: 1117 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor; 1118 break; 1119 case AtomicRMWInst::Max: 1120 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax; 1121 break; 1122 case AtomicRMWInst::Min: 1123 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin; 1124 break; 1125 case AtomicRMWInst::UMax: 1126 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax; 1127 break; 1128 case AtomicRMWInst::UMin: 1129 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin; 1130 break; 1131 case AtomicRMWInst::FAdd: 1132 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd; 1133 break; 1134 case AtomicRMWInst::FMax: 1135 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax; 1136 break; 1137 case AtomicRMWInst::FMin: 1138 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin; 1139 break; 1140 case AtomicRMWInst::FSub: { 1141 report_fatal_error("atomic floating point subtraction not supported for " 1142 "buffer resources and should've been expanded away"); 1143 break; 1144 } 1145 case AtomicRMWInst::Nand: 1146 report_fatal_error("atomic nand not supported for buffer resources and " 1147 "should've been expanded away"); 1148 break; 1149 case AtomicRMWInst::UIncWrap: 1150 case AtomicRMWInst::UDecWrap: 1151 report_fatal_error("wrapping increment/decrement not supported for " 1152 "buffer resources and should've ben expanded away"); 1153 break; 1154 case AtomicRMWInst::BAD_BINOP: 1155 llvm_unreachable("Not sure how we got a bad binop"); 1156 } 1157 } 1158 1159 auto *Call = IRB.CreateIntrinsic(IID, Ty, Args); 1160 copyMetadata(Call, I); 1161 setAlign(Call, Alignment, Arg ? 1 : 0); 1162 Call->takeName(I); 1163 1164 insertPostMemOpFence(Order, SSID); 1165 // The "no moving p7 directly" rewrites ensure that this load or store won't 1166 // itself need to be split into parts. 
1167 SplitUsers.insert(I); 1168 I->replaceAllUsesWith(Call); 1169 return Call; 1170 } 1171 1172 PtrParts SplitPtrStructs::visitInstruction(Instruction &I) { 1173 return {nullptr, nullptr}; 1174 } 1175 1176 PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) { 1177 if (!isSplitFatPtr(LI.getPointerOperandType())) 1178 return {nullptr, nullptr}; 1179 handleMemoryInst(&LI, nullptr, LI.getPointerOperand(), LI.getType(), 1180 LI.getAlign(), LI.getOrdering(), LI.isVolatile(), 1181 LI.getSyncScopeID()); 1182 return {nullptr, nullptr}; 1183 } 1184 1185 PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) { 1186 if (!isSplitFatPtr(SI.getPointerOperandType())) 1187 return {nullptr, nullptr}; 1188 Value *Arg = SI.getValueOperand(); 1189 handleMemoryInst(&SI, Arg, SI.getPointerOperand(), Arg->getType(), 1190 SI.getAlign(), SI.getOrdering(), SI.isVolatile(), 1191 SI.getSyncScopeID()); 1192 return {nullptr, nullptr}; 1193 } 1194 1195 PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) { 1196 if (!isSplitFatPtr(AI.getPointerOperand()->getType())) 1197 return {nullptr, nullptr}; 1198 Value *Arg = AI.getValOperand(); 1199 handleMemoryInst(&AI, Arg, AI.getPointerOperand(), Arg->getType(), 1200 AI.getAlign(), AI.getOrdering(), AI.isVolatile(), 1201 AI.getSyncScopeID()); 1202 return {nullptr, nullptr}; 1203 } 1204 1205 // Unlike load, store, and RMW, cmpxchg needs special handling to account 1206 // for the boolean argument. 1207 PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) { 1208 Value *Ptr = AI.getPointerOperand(); 1209 if (!isSplitFatPtr(Ptr->getType())) 1210 return {nullptr, nullptr}; 1211 IRB.SetInsertPoint(&AI); 1212 1213 Type *Ty = AI.getNewValOperand()->getType(); 1214 AtomicOrdering Order = AI.getMergedOrdering(); 1215 SyncScope::ID SSID = AI.getSyncScopeID(); 1216 bool IsNonTemporal = AI.getMetadata(LLVMContext::MD_nontemporal); 1217 1218 auto [Rsrc, Off] = getPtrParts(Ptr); 1219 insertPreMemOpFence(Order, SSID); 1220 1221 uint32_t Aux = 0; 1222 if (IsNonTemporal) 1223 Aux |= AMDGPU::CPol::SLC; 1224 if (AI.isVolatile()) 1225 Aux |= AMDGPU::CPol::VOLATILE; 1226 auto *Call = 1227 IRB.CreateIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Ty, 1228 {AI.getNewValOperand(), AI.getCompareOperand(), Rsrc, 1229 Off, IRB.getInt32(0), IRB.getInt32(Aux)}); 1230 copyMetadata(Call, &AI); 1231 setAlign(Call, AI.getAlign(), 2); 1232 Call->takeName(&AI); 1233 insertPostMemOpFence(Order, SSID); 1234 1235 Value *Res = PoisonValue::get(AI.getType()); 1236 Res = IRB.CreateInsertValue(Res, Call, 0); 1237 if (!AI.isWeak()) { 1238 Value *Succeeded = IRB.CreateICmpEQ(Call, AI.getCompareOperand()); 1239 Res = IRB.CreateInsertValue(Res, Succeeded, 1); 1240 } 1241 SplitUsers.insert(&AI); 1242 AI.replaceAllUsesWith(Res); 1243 return {nullptr, nullptr}; 1244 } 1245 1246 PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { 1247 using namespace llvm::PatternMatch; 1248 Value *Ptr = GEP.getPointerOperand(); 1249 if (!isSplitFatPtr(Ptr->getType())) 1250 return {nullptr, nullptr}; 1251 IRB.SetInsertPoint(&GEP); 1252 1253 auto [Rsrc, Off] = getPtrParts(Ptr); 1254 const DataLayout &DL = GEP.getModule()->getDataLayout(); 1255 bool InBounds = GEP.isInBounds(); 1256 1257 // In order to call emitGEPOffset() and thus not have to reimplement it, 1258 // we need the GEP result to have ptr addrspace(7) type. 
1259 Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER); 1260 if (auto *VT = dyn_cast<VectorType>(Off->getType())) 1261 FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount()); 1262 GEP.mutateType(FatPtrTy); 1263 Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP); 1264 GEP.mutateType(Ptr->getType()); 1265 if (match(OffAccum, m_Zero())) { // Constant-zero offset 1266 SplitUsers.insert(&GEP); 1267 return {Rsrc, Off}; 1268 } 1269 1270 bool HasNonNegativeOff = false; 1271 if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) { 1272 HasNonNegativeOff = !CI->isNegative(); 1273 } 1274 Value *NewOff; 1275 if (match(Off, m_Zero())) { 1276 NewOff = OffAccum; 1277 } else { 1278 NewOff = IRB.CreateAdd(Off, OffAccum, "", 1279 /*hasNUW=*/InBounds && HasNonNegativeOff, 1280 /*hasNSW=*/false); 1281 } 1282 copyMetadata(NewOff, &GEP); 1283 NewOff->takeName(&GEP); 1284 SplitUsers.insert(&GEP); 1285 return {Rsrc, NewOff}; 1286 } 1287 1288 PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { 1289 Value *Ptr = PI.getPointerOperand(); 1290 if (!isSplitFatPtr(Ptr->getType())) 1291 return {nullptr, nullptr}; 1292 IRB.SetInsertPoint(&PI); 1293 1294 Type *ResTy = PI.getType(); 1295 unsigned Width = ResTy->getScalarSizeInBits(); 1296 1297 auto [Rsrc, Off] = getPtrParts(Ptr); 1298 const DataLayout &DL = PI.getModule()->getDataLayout(); 1299 unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); 1300 1301 Value *Res; 1302 if (Width <= BufferOffsetWidth) { 1303 Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, 1304 PI.getName() + ".off"); 1305 } else { 1306 Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc"); 1307 Value *Shl = IRB.CreateShl( 1308 RsrcInt, 1309 ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), 1310 "", Width >= FatPtrWidth, Width > FatPtrWidth); 1311 Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, 1312 PI.getName() + ".off"); 1313 Res = IRB.CreateOr(Shl, OffCast); 1314 } 1315 1316 copyMetadata(Res, &PI); 1317 Res->takeName(&PI); 1318 SplitUsers.insert(&PI); 1319 PI.replaceAllUsesWith(Res); 1320 return {nullptr, nullptr}; 1321 } 1322 1323 PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) { 1324 if (!isSplitFatPtr(IP.getType())) 1325 return {nullptr, nullptr}; 1326 IRB.SetInsertPoint(&IP); 1327 const DataLayout &DL = IP.getModule()->getDataLayout(); 1328 unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE); 1329 Value *Int = IP.getOperand(0); 1330 Type *IntTy = Int->getType(); 1331 Type *RsrcIntTy = IntTy->getWithNewBitWidth(RsrcPtrWidth); 1332 unsigned Width = IntTy->getScalarSizeInBits(); 1333 1334 auto *RetTy = cast<StructType>(IP.getType()); 1335 Type *RsrcTy = RetTy->getElementType(0); 1336 Type *OffTy = RetTy->getElementType(1); 1337 Value *RsrcPart = IRB.CreateLShr( 1338 Int, 1339 ConstantExpr::getIntegerValue(IntTy, APInt(Width, BufferOffsetWidth))); 1340 Value *RsrcInt = IRB.CreateIntCast(RsrcPart, RsrcIntTy, /*isSigned=*/false); 1341 Value *Rsrc = IRB.CreateIntToPtr(RsrcInt, RsrcTy, IP.getName() + ".rsrc"); 1342 Value *Off = 1343 IRB.CreateIntCast(Int, OffTy, /*IsSigned=*/false, IP.getName() + ".off"); 1344 1345 copyMetadata(Rsrc, &IP); 1346 SplitUsers.insert(&IP); 1347 return {Rsrc, Off}; 1348 } 1349 1350 PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { 1351 if (!isSplitFatPtr(I.getType())) 1352 return {nullptr, nullptr}; 1353 IRB.SetInsertPoint(&I); 1354 Value *In = I.getPointerOperand(); 1355 // No-op casts preserve parts 1356 if 
(In->getType() == I.getType()) { 1357 auto [Rsrc, Off] = getPtrParts(In); 1358 SplitUsers.insert(&I); 1359 return {Rsrc, Off}; 1360 } 1361 if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE) 1362 report_fatal_error("Only buffer resources (addrspace 8) can be cast to " 1363 "buffer fat pointers (addrspace 7)"); 1364 Type *OffTy = cast<StructType>(I.getType())->getElementType(1); 1365 Value *ZeroOff = Constant::getNullValue(OffTy); 1366 SplitUsers.insert(&I); 1367 return {In, ZeroOff}; 1368 } 1369 1370 PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) { 1371 Value *Lhs = Cmp.getOperand(0); 1372 if (!isSplitFatPtr(Lhs->getType())) 1373 return {nullptr, nullptr}; 1374 Value *Rhs = Cmp.getOperand(1); 1375 IRB.SetInsertPoint(&Cmp); 1376 ICmpInst::Predicate Pred = Cmp.getPredicate(); 1377 1378 assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && 1379 "Pointer comparison is only equal or unequal"); 1380 auto [LhsRsrc, LhsOff] = getPtrParts(Lhs); 1381 auto [RhsRsrc, RhsOff] = getPtrParts(Rhs); 1382 Value *RsrcCmp = 1383 IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc"); 1384 copyMetadata(RsrcCmp, &Cmp); 1385 Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off"); 1386 copyMetadata(OffCmp, &Cmp); 1387 1388 Value *Res = nullptr; 1389 if (Pred == ICmpInst::ICMP_EQ) 1390 Res = IRB.CreateAnd(RsrcCmp, OffCmp); 1391 else if (Pred == ICmpInst::ICMP_NE) 1392 Res = IRB.CreateOr(RsrcCmp, OffCmp); 1393 copyMetadata(Res, &Cmp); 1394 Res->takeName(&Cmp); 1395 SplitUsers.insert(&Cmp); 1396 Cmp.replaceAllUsesWith(Res); 1397 return {nullptr, nullptr}; 1398 } 1399 1400 PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) { 1401 if (!isSplitFatPtr(I.getType())) 1402 return {nullptr, nullptr}; 1403 IRB.SetInsertPoint(&I); 1404 auto [Rsrc, Off] = getPtrParts(I.getOperand(0)); 1405 1406 Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc"); 1407 copyMetadata(RsrcRes, &I); 1408 Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off"); 1409 copyMetadata(OffRes, &I); 1410 SplitUsers.insert(&I); 1411 return {RsrcRes, OffRes}; 1412 } 1413 1414 PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) { 1415 if (!isSplitFatPtr(I.getType())) 1416 return {nullptr, nullptr}; 1417 IRB.SetInsertPoint(&I); 1418 Value *Vec = I.getVectorOperand(); 1419 Value *Idx = I.getIndexOperand(); 1420 auto [Rsrc, Off] = getPtrParts(Vec); 1421 1422 Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc"); 1423 copyMetadata(RsrcRes, &I); 1424 Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off"); 1425 copyMetadata(OffRes, &I); 1426 SplitUsers.insert(&I); 1427 return {RsrcRes, OffRes}; 1428 } 1429 1430 PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) { 1431 // The mutated instructions temporarily don't return vectors, and so 1432 // we need the generic getType() here to avoid crashes. 
PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) {
  // The mutated instructions temporarily don't return vectors, and so
  // we need the generic getType() here to avoid crashes.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getOperand(0);
  Value *Elem = I.getOperand(1);
  Value *Idx = I.getOperand(2);
  auto [VecRsrc, VecOff] = getPtrParts(Vec);
  auto [ElemRsrc, ElemOff] = getPtrParts(Elem);

  Value *RsrcRes =
      IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) {
  // Cast is needed for the same reason as insertelement's.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);

  Value *V1 = I.getOperand(0);
  Value *V2 = I.getOperand(1);
  ArrayRef<int> Mask = I.getShuffleMask();
  auto [V1Rsrc, V1Off] = getPtrParts(V1);
  auto [V2Rsrc, V2Off] = getPtrParts(V2);

  Value *RsrcRes =
      IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) {
  if (!isSplitFatPtr(PHI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(*PHI.getInsertionPointAfterDef());
  // Phi nodes will be handled in post-processing after we've visited every
  // instruction. However, instead of just returning {nullptr, nullptr},
  // we explicitly create the temporary extractvalue operations that serve as
  // our results so that they end up at the beginning of the block with the
  // PHIs.
  Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc");
  Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off");
  Conditionals.push_back(&PHI);
  SplitUsers.insert(&PHI);
  return {TmpRsrc, TmpOff};
}

PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) {
  if (!isSplitFatPtr(SI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&SI);

  Value *Cond = SI.getCondition();
  Value *True = SI.getTrueValue();
  Value *False = SI.getFalseValue();
  auto [TrueRsrc, TrueOff] = getPtrParts(True);
  auto [FalseRsrc, FalseOff] = getPtrParts(False);

  Value *RsrcRes =
      IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI);
  copyMetadata(RsrcRes, &SI);
  Conditionals.push_back(&SI);
  Value *OffRes =
      IRB.CreateSelect(Cond, TrueOff, FalseOff, SI.getName() + ".off", &SI);
  copyMetadata(OffRes, &SI);
  SplitUsers.insert(&SI);
  return {RsrcRes, OffRes};
}

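// To illustrate the conditional handling above (value names are illustrative),
// a select over fat pointers is split into one select per component:
//
//   %p.rsrc = select i1 %c, ptr addrspace(8) %a.rsrc, ptr addrspace(8) %b.rsrc
//   %p.off = select i1 %c, i32 %a.off, i32 %b.off
//
// PHI nodes, by contrast, only receive temporary extractvalue-based parts
// here; both selects and PHIs are recorded in `Conditionals` so that
// processConditionals() can rewrite them once every instruction has been
// visited.
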
/// Returns true if this intrinsic needs to be removed when it is
/// applied to `ptr addrspace(7)` values. Calls to these intrinsics are
/// rewritten into calls to versions of that intrinsic on the resource
/// descriptor.
static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  default:
    return false;
  case Intrinsic::ptrmask:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
    return true;
  }
}

PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
  Intrinsic::ID IID = I.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ptrmask: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    Value *Mask = I.getArgOperand(1);
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    if (Mask->getType() != Off->getType())
      report_fatal_error("offset width is not equal to index width of fat "
                         "pointer (data layout not set up correctly?)");
    Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off");
    copyMetadata(OffRes, &I);
    SplitUsers.insert(&I);
    return {Rsrc, OffRes};
  }
  // Pointer annotation intrinsics that, given their object-wide nature,
  // operate on the resource part.
  case Intrinsic::invariant_start: {
    Value *Ptr = I.getArgOperand(1);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE);
    auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::invariant_end: {
    Value *RealPtr = I.getArgOperand(2);
    if (!isSplitFatPtr(RealPtr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    Value *RealRsrc = getPtrParts(RealPtr).first;
    Value *InvPtr = I.getArgOperand(0);
    Value *Size = I.getArgOperand(1);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()},
                                         {InvPtr, Size, RealRsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    return {NewRsrc, Off};
  }
  }
  return {nullptr, nullptr};
}

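// For example (pseudo-notation; value names are illustrative), for a call
// %masked = ptrmask(%p, %m) on a fat pointer, the ptrmask case above leaves
// the resource part untouched and masks only the 32-bit offset:
//
//   %masked.off = and i32 %p.off, %m
//
// The split parts of the call then become {%p.rsrc, %masked.off}, and the
// now-unused intrinsic declaration is erased later via
// isRemovablePointerIntrinsic().
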
void SplitPtrStructs::processFunction(Function &F) {
  ST = &TM->getSubtarget<GCNSubtarget>(F);
  SmallVector<Instruction *, 0> Originals;
  LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName()
                    << "\n");
  for (Instruction &I : instructions(F))
    Originals.push_back(&I);
  for (Instruction *I : Originals) {
    auto [Rsrc, Off] = visit(I);
    assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
           "Can't have a resource but no offset");
    if (Rsrc)
      RsrcParts[I] = Rsrc;
    if (Off)
      OffParts[I] = Off;
  }
  processConditionals();
  killAndReplaceSplitInstructions(Originals);

  // Clean up after ourselves to save on memory.
  RsrcParts.clear();
  OffParts.clear();
  SplitUsers.clear();
  Conditionals.clear();
  ConditionalTemps.clear();
}

namespace {
class AMDGPULowerBufferFatPointers : public ModulePass {
public:
  static char ID;

  AMDGPULowerBufferFatPointers() : ModulePass(ID) {
    initializeAMDGPULowerBufferFatPointersPass(
        *PassRegistry::getPassRegistry());
  }

  bool run(Module &M, const TargetMachine &TM);
  bool runOnModule(Module &M) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // namespace

/// Returns true if there are values that have a buffer fat pointer in them,
/// which means we'll need to perform rewrites on this function. As a side
/// effect, this will populate the type remapping cache.
static bool containsBufferFatPointers(const Function &F,
                                      BufferFatPtrToStructTypeMap *TypeMap) {
  bool HasFatPointers = false;
  for (const BasicBlock &BB : F)
    for (const Instruction &I : BB)
      HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
  return HasFatPointers;
}

/// Returns true if the function's signature mentions buffer fat pointers and
/// so its arguments or return type will need to be rewritten.
static bool hasFatPointerInterface(const Function &F,
                                   BufferFatPtrToStructTypeMap *TypeMap) {
  Type *Ty = F.getFunctionType();
  return Ty != TypeMap->remapType(Ty);
}

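// As a sketch of what an interface change means in practice (types shown for
// illustration): a function declared as
//
//   define void @f(ptr addrspace(7) %p)
//
// is flagged by hasFatPointerInterface() above because its type remaps to one
// taking { ptr addrspace(8), i32 }, so run() below moves its body into a fresh
// function with the remapped signature instead of rewriting it in place.
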
/// Move the body of `OldF` into a new function, returning it.
static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
                                          ValueToValueMapTy &CloneMap) {
  bool IsIntrinsic = OldF->isIntrinsic();
  Function *NewF =
      Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
  NewF->copyAttributesFrom(OldF);
  NewF->copyMetadata(OldF, 0);
  NewF->takeName(OldF);
  NewF->updateAfterNameChange();
  NewF->setDLLStorageClass(OldF->getDLLStorageClass());
  OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF);

  while (!OldF->empty()) {
    BasicBlock *BB = &OldF->front();
    BB->removeFromParent();
    BB->insertInto(NewF);
    CloneMap[BB] = BB;
    for (Instruction &I : *BB) {
      CloneMap[&I] = &I;
    }
  }

  AttributeMask PtrOnlyAttrs;
  for (auto K :
       {Attribute::Dereferenceable, Attribute::DereferenceableOrNull,
        Attribute::NoAlias, Attribute::NoCapture, Attribute::NoFree,
        Attribute::NonNull, Attribute::NullPointerIsValid, Attribute::ReadNone,
        Attribute::ReadOnly, Attribute::WriteOnly}) {
    PtrOnlyAttrs.addAttribute(K);
  }
  SmallVector<AttributeSet> ArgAttrs;
  AttributeList OldAttrs = OldF->getAttributes();

  for (auto [I, OldArg, NewArg] : enumerate(OldF->args(), NewF->args())) {
    CloneMap[&NewArg] = &OldArg;
    NewArg.takeName(&OldArg);
    Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType();
    // Temporarily mutate type of `NewArg` to allow RAUW to work.
    NewArg.mutateType(OldArgTy);
    OldArg.replaceAllUsesWith(&NewArg);
    NewArg.mutateType(NewArgTy);

    AttributeSet ArgAttr = OldAttrs.getParamAttrs(I);
    // Intrinsics get their attributes fixed later.
    if (OldArgTy != NewArgTy && !IsIntrinsic)
      ArgAttr = ArgAttr.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
    ArgAttrs.push_back(ArgAttr);
  }
  AttributeSet RetAttrs = OldAttrs.getRetAttrs();
  if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic)
    RetAttrs = RetAttrs.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
  NewF->setAttributes(AttributeList::get(
      NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs));
  return NewF;
}

static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) {
  for (Argument &A : F->args())
    CloneMap[&A] = &A;
  for (BasicBlock &BB : *F) {
    CloneMap[&BB] = &BB;
    for (Instruction &I : BB)
      CloneMap[&I] = &I;
  }
}

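// Note: makeCloneInPlaceMap() above pre-populates CloneMap with identity
// mappings so that the ValueMapper used in run() below treats the function's
// existing arguments, blocks, and instructions as already mapped, which allows
// it to rewrite such functions in place rather than cloning them.
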
bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // Record the functions which need to be remapped.
  // The second element of the pair indicates whether the function has to have
  // its arguments or return types adjusted.
  SmallVector<std::pair<Function *, bool>> NeedsRemap;

  BufferFatPtrToStructTypeMap StructTM(DL);
  BufferFatPtrToIntTypeMap IntTM(DL);
  for (const GlobalVariable &GV : M.globals()) {
    if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER)
      report_fatal_error("Global variables with a buffer fat pointer address "
                         "space (7) are not supported");
    Type *VT = GV.getValueType();
    if (VT != StructTM.remapType(VT))
      report_fatal_error("Global variables that contain buffer fat pointers "
                         "(address space 7 pointers) are unsupported. Use "
                         "buffer resource pointers (address space 8) instead.");
  }

  {
    // Collect all constant exprs and aggregates referenced by any function.
    SmallVector<Constant *, 8> Worklist;
    for (Function &F : M.functions())
      for (Instruction &I : instructions(F))
        for (Value *Op : I.operands())
          if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
            Worklist.push_back(cast<Constant>(Op));

    // Recursively look for any referenced buffer pointer constants.
    SmallPtrSet<Constant *, 8> Visited;
    SetVector<Constant *> BufferFatPtrConsts;
    while (!Worklist.empty()) {
      Constant *C = Worklist.pop_back_val();
      if (!Visited.insert(C).second)
        continue;
      if (isBufferFatPtrOrVector(C->getType()))
        BufferFatPtrConsts.insert(C);
      for (Value *Op : C->operands())
        if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
          Worklist.push_back(cast<Constant>(Op));
    }

    // Expand all constant expressions using fat buffer pointers to
    // instructions.
    Changed |= convertUsersOfConstantsToInstructions(
        BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr,
        /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
  }

  StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
  for (Function &F : M.functions()) {
    bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
    bool BodyChanges = containsBufferFatPointers(F, &StructTM);
    Changed |= MemOpsRewrite.processFunction(F);
    if (InterfaceChange || BodyChanges)
      NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
  }
  if (NeedsRemap.empty())
    return Changed;

  SmallVector<Function *> NeedsPostProcess;
  SmallVector<Function *> Intrinsics;
  // Keep one big map so as to memoize constants across functions.
  ValueToValueMapTy CloneMap;
  FatPtrConstMaterializer Materializer(&StructTM, CloneMap);

  ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
  for (auto [F, InterfaceChange] : NeedsRemap) {
    Function *NewF = F;
    if (InterfaceChange)
      NewF = moveFunctionAdaptingType(
          F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())),
          CloneMap);
    else
      makeCloneInPlaceMap(F, CloneMap);
    LowerInFuncs.remapFunction(*NewF);
    if (NewF->isIntrinsic())
      Intrinsics.push_back(NewF);
    else
      NeedsPostProcess.push_back(NewF);
    if (InterfaceChange) {
      F->replaceAllUsesWith(NewF);
      F->eraseFromParent();
    }
    Changed = true;
  }
  StructTM.clear();
  IntTM.clear();
  CloneMap.clear();

  SplitPtrStructs Splitter(M.getContext(), &TM);
  for (Function *F : NeedsPostProcess)
    Splitter.processFunction(*F);
  for (Function *F : Intrinsics) {
    if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
      F->eraseFromParent();
    } else {
      std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
      if (NewF)
        F->replaceAllUsesWith(*NewF);
    }
  }
  return Changed;
}

bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return run(M, TM);
}

char AMDGPULowerBufferFatPointers::ID = 0;

char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID;

void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
}

#define PASS_DESC "Lower buffer fat pointer operations to buffer resources"
INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false,
                    false)
#undef PASS_DESC

ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() {
  return new AMDGPULowerBufferFatPointers();
}

PreservedAnalyses
AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) {
  return AMDGPULowerBufferFatPointers().run(M, TM) ? PreservedAnalyses::none()
                                                   : PreservedAnalyses::all();
}