1 //===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // This file contains classes used to discover whether, for a particular value,
9 // there is a path from its definition to a use that crosses a suspend point.
10 //
11 // Using the information discovered we form a Coroutine Frame structure to
12 // contain those values. All uses of those values are replaced with appropriate
13 // GEP + load from the coroutine frame. At the point of the definition we spill
14 // the value into the coroutine frame.
15 //===----------------------------------------------------------------------===//
16
17 #include "CoroInternal.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/ADT/SmallString.h"
22 #include "llvm/Analysis/CFG.h"
23 #include "llvm/Analysis/PtrUseVisitor.h"
24 #include "llvm/Analysis/StackLifetime.h"
25 #include "llvm/Config/llvm-config.h"
26 #include "llvm/IR/CFG.h"
27 #include "llvm/IR/DIBuilder.h"
28 #include "llvm/IR/DebugInfo.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstIterator.h"
32 #include "llvm/IR/IntrinsicInst.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/MathExtras.h"
35 #include "llvm/Support/OptimizedStructLayout.h"
36 #include "llvm/Support/circular_raw_ostream.h"
37 #include "llvm/Support/raw_ostream.h"
38 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
39 #include "llvm/Transforms/Utils/Local.h"
40 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
41 #include <algorithm>
42 #include <deque>
43 #include <optional>
44
45 using namespace llvm;
46
47 extern cl::opt<bool> UseNewDbgInfoFormat;
48
49 // The "coro-suspend-crossing" flag is very noisy. There is another debug type,
50 // "coro-frame", which results in leaner debug spew.
51 #define DEBUG_TYPE "coro-suspend-crossing"
52
53 enum { SmallVectorThreshold = 32 };
54
55 // Provides a two-way mapping between basic blocks and their indices.
56 namespace {
57 class BlockToIndexMapping {
58   SmallVector<BasicBlock *, SmallVectorThreshold> V;
59
60 public:
61   size_t size() const { return V.size(); }
62
63   BlockToIndexMapping(Function &F) {
64     for (BasicBlock &BB : F)
65       V.push_back(&BB);
66     llvm::sort(V);
67   }
68
69   size_t blockToIndex(BasicBlock const *BB) const {
70     auto *I = llvm::lower_bound(V, BB);
71     assert(I != V.end() && *I == BB && "BasicBlockNumbering: Unknown block");
72     return I - V.begin();
73   }
74
75   BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
76 };
77 } // end anonymous namespace
78
79 // The SuspendCrossingInfo maintains data that allows answering the question
80 // whether, given two BasicBlocks A and B, there is a path from A to B that
81 // passes through a suspend point.
82 //
83 // For every basic block 'i' it maintains a BlockData that consists of:
84 //   Consumes:  a bit vector which contains a set of indices of blocks that can
85 //              reach block 'i'. A block can trivially reach itself.
86 //   Kills: a bit vector which contains a set of indices of blocks that can
87 //          reach block 'i' via a path that crosses a suspend point and does
88 //          not repeat 'i' (i.e. a path to 'i' without cycles containing 'i').
89 // Suspend: a boolean indicating whether block 'i' contains a suspend point. 90 // End: a boolean indicating whether block 'i' contains a coro.end intrinsic. 91 // KillLoop: There is a path from 'i' to 'i' not otherwise repeating 'i' that 92 // crosses a suspend point. 93 // 94 namespace { 95 class SuspendCrossingInfo { 96 BlockToIndexMapping Mapping; 97 98 struct BlockData { 99 BitVector Consumes; 100 BitVector Kills; 101 bool Suspend = false; 102 bool End = false; 103 bool KillLoop = false; 104 bool Changed = false; 105 }; 106 SmallVector<BlockData, SmallVectorThreshold> Block; 107 108 iterator_range<pred_iterator> predecessors(BlockData const &BD) const { 109 BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]); 110 return llvm::predecessors(BB); 111 } 112 113 BlockData &getBlockData(BasicBlock *BB) { 114 return Block[Mapping.blockToIndex(BB)]; 115 } 116 117 /// Compute the BlockData for the current function in one iteration. 118 /// Initialize - Whether this is the first iteration, we can optimize 119 /// the initial case a little bit by manual loop switch. 120 /// Returns whether the BlockData changes in this iteration. 121 template <bool Initialize = false> 122 bool computeBlockData(const ReversePostOrderTraversal<Function *> &RPOT); 123 124 public: 125 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 126 void dump() const; 127 void dump(StringRef Label, BitVector const &BV) const; 128 #endif 129 130 SuspendCrossingInfo(Function &F, coro::Shape &Shape); 131 132 /// Returns true if there is a path from \p From to \p To crossing a suspend 133 /// point without crossing \p From a 2nd time. 134 bool hasPathCrossingSuspendPoint(BasicBlock *From, BasicBlock *To) const { 135 size_t const FromIndex = Mapping.blockToIndex(From); 136 size_t const ToIndex = Mapping.blockToIndex(To); 137 bool const Result = Block[ToIndex].Kills[FromIndex]; 138 LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName() 139 << " answer is " << Result << "\n"); 140 return Result; 141 } 142 143 /// Returns true if there is a path from \p From to \p To crossing a suspend 144 /// point without crossing \p From a 2nd time. If \p From is the same as \p To 145 /// this will also check if there is a looping path crossing a suspend point. 146 bool hasPathOrLoopCrossingSuspendPoint(BasicBlock *From, 147 BasicBlock *To) const { 148 size_t const FromIndex = Mapping.blockToIndex(From); 149 size_t const ToIndex = Mapping.blockToIndex(To); 150 bool Result = Block[ToIndex].Kills[FromIndex] || 151 (From == To && Block[ToIndex].KillLoop); 152 LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName() 153 << " answer is " << Result << " (path or loop)\n"); 154 return Result; 155 } 156 157 bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const { 158 auto *I = cast<Instruction>(U); 159 160 // We rewrote PHINodes, so that only the ones with exactly one incoming 161 // value need to be analyzed. 162 if (auto *PN = dyn_cast<PHINode>(I)) 163 if (PN->getNumIncomingValues() > 1) 164 return false; 165 166 BasicBlock *UseBB = I->getParent(); 167 168 // As a special case, treat uses by an llvm.coro.suspend.retcon or an 169 // llvm.coro.suspend.async as if they were uses in the suspend's single 170 // predecessor: the uses conceptually occur before the suspend. 
171 if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) { 172 UseBB = UseBB->getSinglePredecessor(); 173 assert(UseBB && "should have split coro.suspend into its own block"); 174 } 175 176 return hasPathCrossingSuspendPoint(DefBB, UseBB); 177 } 178 179 bool isDefinitionAcrossSuspend(Argument &A, User *U) const { 180 return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U); 181 } 182 183 bool isDefinitionAcrossSuspend(Instruction &I, User *U) const { 184 auto *DefBB = I.getParent(); 185 186 // As a special case, treat values produced by an llvm.coro.suspend.* 187 // as if they were defined in the single successor: the uses 188 // conceptually occur after the suspend. 189 if (isa<AnyCoroSuspendInst>(I)) { 190 DefBB = DefBB->getSingleSuccessor(); 191 assert(DefBB && "should have split coro.suspend into its own block"); 192 } 193 194 return isDefinitionAcrossSuspend(DefBB, U); 195 } 196 197 bool isDefinitionAcrossSuspend(Value &V, User *U) const { 198 if (auto *Arg = dyn_cast<Argument>(&V)) 199 return isDefinitionAcrossSuspend(*Arg, U); 200 if (auto *Inst = dyn_cast<Instruction>(&V)) 201 return isDefinitionAcrossSuspend(*Inst, U); 202 203 llvm_unreachable( 204 "Coroutine could only collect Argument and Instruction now."); 205 } 206 }; 207 } // end anonymous namespace 208 209 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 210 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label, 211 BitVector const &BV) const { 212 dbgs() << Label << ":"; 213 for (size_t I = 0, N = BV.size(); I < N; ++I) 214 if (BV[I]) 215 dbgs() << " " << Mapping.indexToBlock(I)->getName(); 216 dbgs() << "\n"; 217 } 218 219 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const { 220 for (size_t I = 0, N = Block.size(); I < N; ++I) { 221 BasicBlock *const B = Mapping.indexToBlock(I); 222 dbgs() << B->getName() << ":\n"; 223 dump(" Consumes", Block[I].Consumes); 224 dump(" Kills", Block[I].Kills); 225 } 226 dbgs() << "\n"; 227 } 228 #endif 229 230 template <bool Initialize> 231 bool SuspendCrossingInfo::computeBlockData( 232 const ReversePostOrderTraversal<Function *> &RPOT) { 233 bool Changed = false; 234 235 for (const BasicBlock *BB : RPOT) { 236 auto BBNo = Mapping.blockToIndex(BB); 237 auto &B = Block[BBNo]; 238 239 // We don't need to count the predecessors when initialization. 240 if constexpr (!Initialize) 241 // If all the predecessors of the current Block don't change, 242 // the BlockData for the current block must not change too. 243 if (all_of(predecessors(B), [this](BasicBlock *BB) { 244 return !Block[Mapping.blockToIndex(BB)].Changed; 245 })) { 246 B.Changed = false; 247 continue; 248 } 249 250 // Saved Consumes and Kills bitsets so that it is easy to see 251 // if anything changed after propagation. 252 auto SavedConsumes = B.Consumes; 253 auto SavedKills = B.Kills; 254 255 for (BasicBlock *PI : predecessors(B)) { 256 auto PrevNo = Mapping.blockToIndex(PI); 257 auto &P = Block[PrevNo]; 258 259 // Propagate Kills and Consumes from predecessors into B. 260 B.Consumes |= P.Consumes; 261 B.Kills |= P.Kills; 262 263 // If block P is a suspend block, it should propagate kills into block 264 // B for every block P consumes. 265 if (P.Suspend) 266 B.Kills |= P.Consumes; 267 } 268 269 if (B.Suspend) { 270 // If block B is a suspend block, it should kill all of the blocks it 271 // consumes. 
272       B.Kills |= B.Consumes;
273     } else if (B.End) {
274       // If block B is an end block, it should not propagate kills as the
275       // blocks following coro.end() are reached during initial invocation
276       // of the coroutine while all the data is still available on the
277       // stack or in the registers.
278       B.Kills.reset();
279     } else {
280       // This is reached when block B is neither a suspend block nor a
281       // coro.end block; make sure it is not in its own kill set.
282       B.KillLoop |= B.Kills[BBNo];
283       B.Kills.reset(BBNo);
284     }
285
286     if constexpr (!Initialize) {
287       B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
288       Changed |= B.Changed;
289     }
290   }
291
292   return Changed;
293 }
294
295 SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
296     : Mapping(F) {
297   const size_t N = Mapping.size();
298   Block.resize(N);
299
300   // Initialize every block so that it consumes itself.
301   for (size_t I = 0; I < N; ++I) {
302     auto &B = Block[I];
303     B.Consumes.resize(N);
304     B.Kills.resize(N);
305     B.Consumes.set(I);
306     B.Changed = true;
307   }
308
309   // Mark all CoroEnd blocks. We do not propagate Kills beyond coro.ends as
310   // the code beyond coro.end is reachable during initial invocation of the
311   // coroutine.
312   for (auto *CE : Shape.CoroEnds)
313     getBlockData(CE->getParent()).End = true;
314
315   // Mark all suspend blocks and indicate that they kill everything they
316   // consume. Note that crossing coro.save also requires a spill, as any code
317   // between coro.save and coro.suspend may resume the coroutine and all of the
318   // state needs to be saved by that time.
319   auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
320     BasicBlock *SuspendBlock = BarrierInst->getParent();
321     auto &B = getBlockData(SuspendBlock);
322     B.Suspend = true;
323     B.Kills |= B.Consumes;
324   };
325   for (auto *CSI : Shape.CoroSuspends) {
326     markSuspendBlock(CSI);
327     if (auto *Save = CSI->getCoroSave())
328       markSuspendBlock(Save);
329   }
330
331   // An RPO traversal is generally considered faster for a forward-edges
332   // dataflow analysis like this one.
333   ReversePostOrderTraversal<Function *> RPOT(&F);
334   computeBlockData</*Initialize=*/true>(RPOT);
335   while (computeBlockData</*Initialize*/ false>(RPOT))
336     ;
337
338   LLVM_DEBUG(dump());
339 }
340
341 namespace {
342
343 // RematGraph is used to construct a DAG for rematerializable instructions.
344 // When the constructor is invoked with a candidate instruction (which is
345 // materializable) it builds a DAG of materializable instructions from that
346 // point.
347 // Typically, for each instruction identified as re-materializable across a
348 // suspend point, a RematGraph will be created.
349 struct RematGraph {
350   // Each RematNode in the graph contains edges to the instructions providing
351   // operands of the current node.
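  // As an illustrative (hypothetical) example, for IR such as
  //   %a = add i32 %x, 1        ; materializable
  //   %b = mul i32 %a, %a       ; materializable, used across a suspend
  // a RematGraph rooted at %b gets an operand edge to a node for %a, provided
  // %a's definition also crosses the suspend point relative to the use.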
352 struct RematNode { 353 Instruction *Node; 354 SmallVector<RematNode *> Operands; 355 RematNode() = default; 356 RematNode(Instruction *V) : Node(V) {} 357 }; 358 359 RematNode *EntryNode; 360 using RematNodeMap = 361 SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>; 362 RematNodeMap Remats; 363 const std::function<bool(Instruction &)> &MaterializableCallback; 364 SuspendCrossingInfo &Checker; 365 366 RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback, 367 Instruction *I, SuspendCrossingInfo &Checker) 368 : MaterializableCallback(MaterializableCallback), Checker(Checker) { 369 std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I); 370 EntryNode = FirstNode.get(); 371 std::deque<std::unique_ptr<RematNode>> WorkList; 372 addNode(std::move(FirstNode), WorkList, cast<User>(I)); 373 while (WorkList.size()) { 374 std::unique_ptr<RematNode> N = std::move(WorkList.front()); 375 WorkList.pop_front(); 376 addNode(std::move(N), WorkList, cast<User>(I)); 377 } 378 } 379 380 void addNode(std::unique_ptr<RematNode> NUPtr, 381 std::deque<std::unique_ptr<RematNode>> &WorkList, 382 User *FirstUse) { 383 RematNode *N = NUPtr.get(); 384 if (Remats.count(N->Node)) 385 return; 386 387 // We haven't see this node yet - add to the list 388 Remats[N->Node] = std::move(NUPtr); 389 for (auto &Def : N->Node->operands()) { 390 Instruction *D = dyn_cast<Instruction>(Def.get()); 391 if (!D || !MaterializableCallback(*D) || 392 !Checker.isDefinitionAcrossSuspend(*D, FirstUse)) 393 continue; 394 395 if (Remats.count(D)) { 396 // Already have this in the graph 397 N->Operands.push_back(Remats[D].get()); 398 continue; 399 } 400 401 bool NoMatch = true; 402 for (auto &I : WorkList) { 403 if (I->Node == D) { 404 NoMatch = false; 405 N->Operands.push_back(I.get()); 406 break; 407 } 408 } 409 if (NoMatch) { 410 // Create a new node 411 std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D); 412 N->Operands.push_back(ChildNode.get()); 413 WorkList.push_back(std::move(ChildNode)); 414 } 415 } 416 } 417 418 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 419 void dump() const { 420 dbgs() << "Entry ("; 421 if (EntryNode->Node->getParent()->hasName()) 422 dbgs() << EntryNode->Node->getParent()->getName(); 423 else 424 EntryNode->Node->getParent()->printAsOperand(dbgs(), false); 425 dbgs() << ") : " << *EntryNode->Node << "\n"; 426 for (auto &E : Remats) { 427 dbgs() << *(E.first) << "\n"; 428 for (RematNode *U : E.second->Operands) 429 dbgs() << " " << *U->Node << "\n"; 430 } 431 } 432 #endif 433 }; 434 } // end anonymous namespace 435 436 namespace llvm { 437 438 template <> struct GraphTraits<RematGraph *> { 439 using NodeRef = RematGraph::RematNode *; 440 using ChildIteratorType = RematGraph::RematNode **; 441 442 static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; } 443 static ChildIteratorType child_begin(NodeRef N) { 444 return N->Operands.begin(); 445 } 446 static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); } 447 }; 448 449 } // end namespace llvm 450 451 #undef DEBUG_TYPE // "coro-suspend-crossing" 452 #define DEBUG_TYPE "coro-frame" 453 454 namespace { 455 class FrameTypeBuilder; 456 // Mapping from the to-be-spilled value to all the users that need reload. 
457 using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>; 458 struct AllocaInfo { 459 AllocaInst *Alloca; 460 DenseMap<Instruction *, std::optional<APInt>> Aliases; 461 bool MayWriteBeforeCoroBegin; 462 AllocaInfo(AllocaInst *Alloca, 463 DenseMap<Instruction *, std::optional<APInt>> Aliases, 464 bool MayWriteBeforeCoroBegin) 465 : Alloca(Alloca), Aliases(std::move(Aliases)), 466 MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {} 467 }; 468 struct FrameDataInfo { 469 // All the values (that are not allocas) that needs to be spilled to the 470 // frame. 471 SpillInfo Spills; 472 // Allocas contains all values defined as allocas that need to live in the 473 // frame. 474 SmallVector<AllocaInfo, 8> Allocas; 475 476 SmallVector<Value *, 8> getAllDefs() const { 477 SmallVector<Value *, 8> Defs; 478 for (const auto &P : Spills) 479 Defs.push_back(P.first); 480 for (const auto &A : Allocas) 481 Defs.push_back(A.Alloca); 482 return Defs; 483 } 484 485 uint32_t getFieldIndex(Value *V) const { 486 auto Itr = FieldIndexMap.find(V); 487 assert(Itr != FieldIndexMap.end() && 488 "Value does not have a frame field index"); 489 return Itr->second; 490 } 491 492 void setFieldIndex(Value *V, uint32_t Index) { 493 assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) && 494 "Cannot set the index for the same field twice."); 495 FieldIndexMap[V] = Index; 496 } 497 498 Align getAlign(Value *V) const { 499 auto Iter = FieldAlignMap.find(V); 500 assert(Iter != FieldAlignMap.end()); 501 return Iter->second; 502 } 503 504 void setAlign(Value *V, Align AL) { 505 assert(FieldAlignMap.count(V) == 0); 506 FieldAlignMap.insert({V, AL}); 507 } 508 509 uint64_t getDynamicAlign(Value *V) const { 510 auto Iter = FieldDynamicAlignMap.find(V); 511 assert(Iter != FieldDynamicAlignMap.end()); 512 return Iter->second; 513 } 514 515 void setDynamicAlign(Value *V, uint64_t Align) { 516 assert(FieldDynamicAlignMap.count(V) == 0); 517 FieldDynamicAlignMap.insert({V, Align}); 518 } 519 520 uint64_t getOffset(Value *V) const { 521 auto Iter = FieldOffsetMap.find(V); 522 assert(Iter != FieldOffsetMap.end()); 523 return Iter->second; 524 } 525 526 void setOffset(Value *V, uint64_t Offset) { 527 assert(FieldOffsetMap.count(V) == 0); 528 FieldOffsetMap.insert({V, Offset}); 529 } 530 531 // Remap the index of every field in the frame, using the final layout index. 532 void updateLayoutIndex(FrameTypeBuilder &B); 533 534 private: 535 // LayoutIndexUpdateStarted is used to avoid updating the index of any field 536 // twice by mistake. 537 bool LayoutIndexUpdateStarted = false; 538 // Map from values to their slot indexes on the frame. They will be first set 539 // with their original insertion field index. After the frame is built, their 540 // indexes will be updated into the final layout index. 541 DenseMap<Value *, uint32_t> FieldIndexMap; 542 // Map from values to their alignment on the frame. They would be set after 543 // the frame is built. 544 DenseMap<Value *, Align> FieldAlignMap; 545 DenseMap<Value *, uint64_t> FieldDynamicAlignMap; 546 // Map from values to their offset on the frame. They would be set after 547 // the frame is built. 
548 DenseMap<Value *, uint64_t> FieldOffsetMap; 549 }; 550 } // namespace 551 552 #ifndef NDEBUG 553 static void dumpSpills(StringRef Title, const SpillInfo &Spills) { 554 dbgs() << "------------- " << Title << "--------------\n"; 555 for (const auto &E : Spills) { 556 E.first->dump(); 557 dbgs() << " user: "; 558 for (auto *I : E.second) 559 I->dump(); 560 } 561 } 562 static void dumpRemats( 563 StringRef Title, 564 const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) { 565 dbgs() << "------------- " << Title << "--------------\n"; 566 for (const auto &E : RM) { 567 E.second->dump(); 568 dbgs() << "--\n"; 569 } 570 } 571 572 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) { 573 dbgs() << "------------- Allocas --------------\n"; 574 for (const auto &A : Allocas) { 575 A.Alloca->dump(); 576 } 577 } 578 #endif 579 580 namespace { 581 using FieldIDType = size_t; 582 // We cannot rely solely on natural alignment of a type when building a 583 // coroutine frame and if the alignment specified on the Alloca instruction 584 // differs from the natural alignment of the alloca type we will need to insert 585 // padding. 586 class FrameTypeBuilder { 587 private: 588 struct Field { 589 uint64_t Size; 590 uint64_t Offset; 591 Type *Ty; 592 FieldIDType LayoutFieldIndex; 593 Align Alignment; 594 Align TyAlignment; 595 uint64_t DynamicAlignBuffer; 596 }; 597 598 const DataLayout &DL; 599 LLVMContext &Context; 600 uint64_t StructSize = 0; 601 Align StructAlign; 602 bool IsFinished = false; 603 604 std::optional<Align> MaxFrameAlignment; 605 606 SmallVector<Field, 8> Fields; 607 DenseMap<Value*, unsigned> FieldIndexByKey; 608 609 public: 610 FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL, 611 std::optional<Align> MaxFrameAlignment) 612 : DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {} 613 614 /// Add a field to this structure for the storage of an `alloca` 615 /// instruction. 616 [[nodiscard]] FieldIDType addFieldForAlloca(AllocaInst *AI, 617 bool IsHeader = false) { 618 Type *Ty = AI->getAllocatedType(); 619 620 // Make an array type if this is a static array allocation. 621 if (AI->isArrayAllocation()) { 622 if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) 623 Ty = ArrayType::get(Ty, CI->getValue().getZExtValue()); 624 else 625 report_fatal_error("Coroutines cannot handle non static allocas yet"); 626 } 627 628 return addField(Ty, AI->getAlign(), IsHeader); 629 } 630 631 /// We want to put the allocas whose lifetime-ranges are not overlapped 632 /// into one slot of coroutine frame. 633 /// Consider the example at:https://bugs.llvm.org/show_bug.cgi?id=45566 634 /// 635 /// cppcoro::task<void> alternative_paths(bool cond) { 636 /// if (cond) { 637 /// big_structure a; 638 /// process(a); 639 /// co_await something(); 640 /// } else { 641 /// big_structure b; 642 /// process2(b); 643 /// co_await something(); 644 /// } 645 /// } 646 /// 647 /// We want to put variable a and variable b in the same slot to 648 /// reduce the size of coroutine frame. 649 /// 650 /// This function use StackLifetime algorithm to partition the AllocaInsts in 651 /// Spills to non-overlapped sets in order to put Alloca in the same 652 /// non-overlapped set into the same slot in the Coroutine Frame. Then add 653 /// field for the allocas in the same non-overlapped set by using the largest 654 /// type as the field type. 
655 /// 656 /// Side Effects: Because We sort the allocas, the order of allocas in the 657 /// frame may be different with the order in the source code. 658 void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData, 659 coro::Shape &Shape); 660 661 /// Add a field to this structure. 662 [[nodiscard]] FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment, 663 bool IsHeader = false, 664 bool IsSpillOfValue = false) { 665 assert(!IsFinished && "adding fields to a finished builder"); 666 assert(Ty && "must provide a type for a field"); 667 668 // The field size is always the alloc size of the type. 669 uint64_t FieldSize = DL.getTypeAllocSize(Ty); 670 671 // For an alloca with size=0, we don't need to add a field and they 672 // can just point to any index in the frame. Use index 0. 673 if (FieldSize == 0) { 674 return 0; 675 } 676 677 // The field alignment might not be the type alignment, but we need 678 // to remember the type alignment anyway to build the type. 679 // If we are spilling values we don't need to worry about ABI alignment 680 // concerns. 681 Align ABIAlign = DL.getABITypeAlign(Ty); 682 Align TyAlignment = ABIAlign; 683 if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign) 684 TyAlignment = *MaxFrameAlignment; 685 Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment); 686 687 // The field alignment could be bigger than the max frame case, in that case 688 // we request additional storage to be able to dynamically align the 689 // pointer. 690 uint64_t DynamicAlignBuffer = 0; 691 if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) { 692 DynamicAlignBuffer = 693 offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment); 694 FieldAlignment = *MaxFrameAlignment; 695 FieldSize = FieldSize + DynamicAlignBuffer; 696 } 697 698 // Lay out header fields immediately. 699 uint64_t Offset; 700 if (IsHeader) { 701 Offset = alignTo(StructSize, FieldAlignment); 702 StructSize = Offset + FieldSize; 703 704 // Everything else has a flexible offset. 705 } else { 706 Offset = OptimizedStructLayoutField::FlexibleOffset; 707 } 708 709 Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment, 710 DynamicAlignBuffer}); 711 return Fields.size() - 1; 712 } 713 714 /// Finish the layout and set the body on the given type. 715 void finish(StructType *Ty); 716 717 uint64_t getStructSize() const { 718 assert(IsFinished && "not yet finished!"); 719 return StructSize; 720 } 721 722 Align getStructAlign() const { 723 assert(IsFinished && "not yet finished!"); 724 return StructAlign; 725 } 726 727 FieldIDType getLayoutFieldIndex(FieldIDType Id) const { 728 assert(IsFinished && "not yet finished!"); 729 return Fields[Id].LayoutFieldIndex; 730 } 731 732 Field getLayoutField(FieldIDType Id) const { 733 assert(IsFinished && "not yet finished!"); 734 return Fields[Id]; 735 } 736 }; 737 } // namespace 738 739 void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) { 740 auto Updater = [&](Value *I) { 741 auto Field = B.getLayoutField(getFieldIndex(I)); 742 setFieldIndex(I, Field.LayoutFieldIndex); 743 setAlign(I, Field.Alignment); 744 uint64_t dynamicAlign = 745 Field.DynamicAlignBuffer 746 ? 
Field.DynamicAlignBuffer + Field.Alignment.value()
747             : 0;
748     setDynamicAlign(I, dynamicAlign);
749     setOffset(I, Field.Offset);
750   };
751   LayoutIndexUpdateStarted = true;
752   for (auto &S : Spills)
753     Updater(S.first);
754   for (const auto &A : Allocas)
755     Updater(A.Alloca);
756   LayoutIndexUpdateStarted = false;
757 }
758
759 void FrameTypeBuilder::addFieldForAllocas(const Function &F,
760                                           FrameDataInfo &FrameData,
761                                           coro::Shape &Shape) {
762   using AllocaSetType = SmallVector<AllocaInst *, 4>;
763   SmallVector<AllocaSetType, 4> NonOverlapedAllocas;
764
765   // We need to add fields for the allocas at the end of this function.
766   auto AddFieldForAllocasAtExit = make_scope_exit([&]() {
767     for (auto AllocaList : NonOverlapedAllocas) {
768       auto *LargestAI = *AllocaList.begin();
769       FieldIDType Id = addFieldForAlloca(LargestAI);
770       for (auto *Alloca : AllocaList)
771         FrameData.setFieldIndex(Alloca, Id);
772     }
773   });
774
775   if (!Shape.OptimizeFrame) {
776     for (const auto &A : FrameData.Allocas) {
777       AllocaInst *Alloca = A.Alloca;
778       NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
779     }
780     return;
781   }
782
783   // Because there are paths from the lifetime.start to coro.end
784   // for each alloca, the live ranges of all allocas overlap
785   // in the blocks that contain coro.end and in their successor blocks.
786   // So we choose to skip those blocks when we calculate the live range
787   // for each alloca. This should be reasonable since there shouldn't be uses
788   // in these blocks and the coroutine frame shouldn't be used outside the
789   // coroutine body.
790   //
791   // Note that the user of coro.suspend may not be a SwitchInst. However, this
792   // case seems too complex to handle. And it is harmless to skip these
793   // patterns since it merely prevents putting such allocas into the same
794   // slot.
795   DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest;
796   for (auto *CoroSuspendInst : Shape.CoroSuspends) {
797     for (auto *U : CoroSuspendInst->users()) {
798       if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) {
799         auto *SWI = const_cast<SwitchInst *>(ConstSWI);
800         DefaultSuspendDest[SWI] = SWI->getDefaultDest();
801         SWI->setDefaultDest(SWI->getSuccessor(1));
802       }
803     }
804   }
805
806   auto ExtractAllocas = [&]() {
807     AllocaSetType Allocas;
808     Allocas.reserve(FrameData.Allocas.size());
809     for (const auto &A : FrameData.Allocas)
810       Allocas.push_back(A.Alloca);
811     return Allocas;
812   };
813   StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(),
814                                       StackLifetime::LivenessType::May);
815   StackLifetimeAnalyzer.run();
816   auto IsAllocaInferenre = [&](const AllocaInst *AI1, const AllocaInst *AI2) {
817     return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps(
818         StackLifetimeAnalyzer.getLiveRange(AI2));
819   };
820   auto GetAllocaSize = [&](const AllocaInfo &A) {
821     std::optional<TypeSize> RetSize = A.Alloca->getAllocationSize(DL);
822     assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n");
823     assert(!RetSize->isScalable() && "Scalable vectors are not yet supported");
824     return RetSize->getFixedValue();
825   };
826   // Put larger allocas in the front, so that larger allocas get a higher
827   // priority to merge, which can potentially save more space. Also, each
828   // AllocaSet stays ordered, so we can easily pick the largest Alloca in an
829   // AllocaSet.
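  // For example (hypothetical sizes): given allocas of 64, 32 and 16 bytes
  // whose live ranges do not overlap pairwise and whose alignments are
  // compatible, the 64-byte alloca starts the first set and the smaller ones
  // are merged into it, so a single 64-byte field backs all three.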
830 sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) { 831 return GetAllocaSize(Iter1) > GetAllocaSize(Iter2); 832 }); 833 for (const auto &A : FrameData.Allocas) { 834 AllocaInst *Alloca = A.Alloca; 835 bool Merged = false; 836 // Try to find if the Alloca is not inferenced with any existing 837 // NonOverlappedAllocaSet. If it is true, insert the alloca to that 838 // NonOverlappedAllocaSet. 839 for (auto &AllocaSet : NonOverlapedAllocas) { 840 assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n"); 841 bool NoInference = none_of(AllocaSet, [&](auto Iter) { 842 return IsAllocaInferenre(Alloca, Iter); 843 }); 844 // If the alignment of A is multiple of the alignment of B, the address 845 // of A should satisfy the requirement for aligning for B. 846 // 847 // There may be other more fine-grained strategies to handle the alignment 848 // infomation during the merging process. But it seems hard to handle 849 // these strategies and benefit little. 850 bool Alignable = [&]() -> bool { 851 auto *LargestAlloca = *AllocaSet.begin(); 852 return LargestAlloca->getAlign().value() % Alloca->getAlign().value() == 853 0; 854 }(); 855 bool CouldMerge = NoInference && Alignable; 856 if (!CouldMerge) 857 continue; 858 AllocaSet.push_back(Alloca); 859 Merged = true; 860 break; 861 } 862 if (!Merged) { 863 NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); 864 } 865 } 866 // Recover the default target destination for each Switch statement 867 // reserved. 868 for (auto SwitchAndDefaultDest : DefaultSuspendDest) { 869 SwitchInst *SWI = SwitchAndDefaultDest.first; 870 BasicBlock *DestBB = SwitchAndDefaultDest.second; 871 SWI->setDefaultDest(DestBB); 872 } 873 // This Debug Info could tell us which allocas are merged into one slot. 874 LLVM_DEBUG(for (auto &AllocaSet 875 : NonOverlapedAllocas) { 876 if (AllocaSet.size() > 1) { 877 dbgs() << "In Function:" << F.getName() << "\n"; 878 dbgs() << "Find Union Set " 879 << "\n"; 880 dbgs() << "\tAllocas are \n"; 881 for (auto Alloca : AllocaSet) 882 dbgs() << "\t\t" << *Alloca << "\n"; 883 } 884 }); 885 } 886 887 void FrameTypeBuilder::finish(StructType *Ty) { 888 assert(!IsFinished && "already finished!"); 889 890 // Prepare the optimal-layout field array. 891 // The Id in the layout field is a pointer to our Field for it. 892 SmallVector<OptimizedStructLayoutField, 8> LayoutFields; 893 LayoutFields.reserve(Fields.size()); 894 for (auto &Field : Fields) { 895 LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment, 896 Field.Offset); 897 } 898 899 // Perform layout. 900 auto SizeAndAlign = performOptimizedStructLayout(LayoutFields); 901 StructSize = SizeAndAlign.first; 902 StructAlign = SizeAndAlign.second; 903 904 auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & { 905 return *static_cast<Field *>(const_cast<void*>(LayoutField.Id)); 906 }; 907 908 // We need to produce a packed struct type if there's a field whose 909 // assigned offset isn't a multiple of its natural type alignment. 910 bool Packed = [&] { 911 for (auto &LayoutField : LayoutFields) { 912 auto &F = getField(LayoutField); 913 if (!isAligned(F.TyAlignment, LayoutField.Offset)) 914 return true; 915 } 916 return false; 917 }(); 918 919 // Build the struct body. 
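  // For example (hypothetical layout): if a field of type i64 is assigned to
  // offset 12 while the previous field ended at offset 8, a [4 x i8] padding
  // array is emitted before it, and because 12 is not a multiple of the i64
  // alignment the struct has already been marked packed.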
920 SmallVector<Type*, 16> FieldTypes; 921 FieldTypes.reserve(LayoutFields.size() * 3 / 2); 922 uint64_t LastOffset = 0; 923 for (auto &LayoutField : LayoutFields) { 924 auto &F = getField(LayoutField); 925 926 auto Offset = LayoutField.Offset; 927 928 // Add a padding field if there's a padding gap and we're either 929 // building a packed struct or the padding gap is more than we'd 930 // get from aligning to the field type's natural alignment. 931 assert(Offset >= LastOffset); 932 if (Offset != LastOffset) { 933 if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset) 934 FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context), 935 Offset - LastOffset)); 936 } 937 938 F.Offset = Offset; 939 F.LayoutFieldIndex = FieldTypes.size(); 940 941 FieldTypes.push_back(F.Ty); 942 if (F.DynamicAlignBuffer) { 943 FieldTypes.push_back( 944 ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer)); 945 } 946 LastOffset = Offset + F.Size; 947 } 948 949 Ty->setBody(FieldTypes, Packed); 950 951 #ifndef NDEBUG 952 // Check that the IR layout matches the offsets we expect. 953 auto Layout = DL.getStructLayout(Ty); 954 for (auto &F : Fields) { 955 assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty); 956 assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset); 957 } 958 #endif 959 960 IsFinished = true; 961 } 962 963 static void cacheDIVar(FrameDataInfo &FrameData, 964 DenseMap<Value *, DILocalVariable *> &DIVarCache) { 965 for (auto *V : FrameData.getAllDefs()) { 966 if (DIVarCache.contains(V)) 967 continue; 968 969 auto CacheIt = [&DIVarCache, V](const auto &Container) { 970 auto *I = llvm::find_if(Container, [](auto *DDI) { 971 return DDI->getExpression()->getNumElements() == 0; 972 }); 973 if (I != Container.end()) 974 DIVarCache.insert({V, (*I)->getVariable()}); 975 }; 976 CacheIt(findDbgDeclares(V)); 977 CacheIt(findDVRDeclares(V)); 978 } 979 } 980 981 /// Create name for Type. It uses MDString to store new created string to 982 /// avoid memory leak. 983 static StringRef solveTypeName(Type *Ty) { 984 if (Ty->isIntegerTy()) { 985 // The longest name in common may be '__int_128', which has 9 bits. 986 SmallString<16> Buffer; 987 raw_svector_ostream OS(Buffer); 988 OS << "__int_" << cast<IntegerType>(Ty)->getBitWidth(); 989 auto *MDName = MDString::get(Ty->getContext(), OS.str()); 990 return MDName->getString(); 991 } 992 993 if (Ty->isFloatingPointTy()) { 994 if (Ty->isFloatTy()) 995 return "__float_"; 996 if (Ty->isDoubleTy()) 997 return "__double_"; 998 return "__floating_type_"; 999 } 1000 1001 if (Ty->isPointerTy()) 1002 return "PointerType"; 1003 1004 if (Ty->isStructTy()) { 1005 if (!cast<StructType>(Ty)->hasName()) 1006 return "__LiteralStructType_"; 1007 1008 auto Name = Ty->getStructName(); 1009 1010 SmallString<16> Buffer(Name); 1011 for (auto &Iter : Buffer) 1012 if (Iter == '.' 
|| Iter == ':') 1013 Iter = '_'; 1014 auto *MDName = MDString::get(Ty->getContext(), Buffer.str()); 1015 return MDName->getString(); 1016 } 1017 1018 return "UnknownType"; 1019 } 1020 1021 static DIType *solveDIType(DIBuilder &Builder, Type *Ty, 1022 const DataLayout &Layout, DIScope *Scope, 1023 unsigned LineNum, 1024 DenseMap<Type *, DIType *> &DITypeCache) { 1025 if (DIType *DT = DITypeCache.lookup(Ty)) 1026 return DT; 1027 1028 StringRef Name = solveTypeName(Ty); 1029 1030 DIType *RetType = nullptr; 1031 1032 if (Ty->isIntegerTy()) { 1033 auto BitWidth = cast<IntegerType>(Ty)->getBitWidth(); 1034 RetType = Builder.createBasicType(Name, BitWidth, dwarf::DW_ATE_signed, 1035 llvm::DINode::FlagArtificial); 1036 } else if (Ty->isFloatingPointTy()) { 1037 RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty), 1038 dwarf::DW_ATE_float, 1039 llvm::DINode::FlagArtificial); 1040 } else if (Ty->isPointerTy()) { 1041 // Construct PointerType points to null (aka void *) instead of exploring 1042 // pointee type to avoid infinite search problem. For example, we would be 1043 // in trouble if we traverse recursively: 1044 // 1045 // struct Node { 1046 // Node* ptr; 1047 // }; 1048 RetType = 1049 Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty), 1050 Layout.getABITypeAlign(Ty).value() * CHAR_BIT, 1051 /*DWARFAddressSpace=*/std::nullopt, Name); 1052 } else if (Ty->isStructTy()) { 1053 auto *DIStruct = Builder.createStructType( 1054 Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty), 1055 Layout.getPrefTypeAlign(Ty).value() * CHAR_BIT, 1056 llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray()); 1057 1058 auto *StructTy = cast<StructType>(Ty); 1059 SmallVector<Metadata *, 16> Elements; 1060 for (unsigned I = 0; I < StructTy->getNumElements(); I++) { 1061 DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout, 1062 Scope, LineNum, DITypeCache); 1063 assert(DITy); 1064 Elements.push_back(Builder.createMemberType( 1065 Scope, DITy->getName(), Scope->getFile(), LineNum, 1066 DITy->getSizeInBits(), DITy->getAlignInBits(), 1067 Layout.getStructLayout(StructTy)->getElementOffsetInBits(I), 1068 llvm::DINode::FlagArtificial, DITy)); 1069 } 1070 1071 Builder.replaceArrays(DIStruct, Builder.getOrCreateArray(Elements)); 1072 1073 RetType = DIStruct; 1074 } else { 1075 LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n"); 1076 TypeSize Size = Layout.getTypeSizeInBits(Ty); 1077 auto *CharSizeType = Builder.createBasicType( 1078 Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial); 1079 1080 if (Size <= 8) 1081 RetType = CharSizeType; 1082 else { 1083 if (Size % 8 != 0) 1084 Size = TypeSize::getFixed(Size + 8 - (Size % 8)); 1085 1086 RetType = Builder.createArrayType( 1087 Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType, 1088 Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8))); 1089 } 1090 } 1091 1092 DITypeCache.insert({Ty, RetType}); 1093 return RetType; 1094 } 1095 1096 /// Build artificial debug info for C++ coroutine frames to allow users to 1097 /// inspect the contents of the frame directly 1098 /// 1099 /// Create Debug information for coroutine frame with debug name "__coro_frame". 1100 /// The debug information for the fields of coroutine frame is constructed from 1101 /// the following way: 1102 /// 1. For all the value in the Frame, we search the use of dbg.declare to find 1103 /// the corresponding debug variables for the value. 
If we can find the
1104 ///    debug variable, we can get full and accurate debug information.
1105 /// 2. If we can't find debug information in step 1, we try to build the
1106 ///    DIType from the IR type alone. We do this in solveDIType. We only
1107 ///    handle integer, floating-point, pointer and struct types for now.
1108 static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
1109                                 FrameDataInfo &FrameData) {
1110   DISubprogram *DIS = F.getSubprogram();
1111   // If there is no DISubprogram for F, the function was not compiled with
1112   // debug info, so we don't need to generate debug info for the frame
1113   // either.
1114   if (!DIS || !DIS->getUnit() ||
1115       !dwarf::isCPlusPlus(
1116           (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()))
1117     return;
1118
1119   assert(Shape.ABI == coro::ABI::Switch &&
1120          "We can only build debug information for C++ coroutines now.\n");
1121
1122   DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);
1123
1124   AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
1125   assert(PromiseAlloca &&
1126          "Coroutine with switch ABI should own Promise alloca");
1127
1128   DIFile *DFile = DIS->getFile();
1129   unsigned LineNum = DIS->getLine();
1130
1131   DICompositeType *FrameDITy = DBuilder.createStructType(
1132       DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(),
1133       DFile, LineNum, Shape.FrameSize * 8,
1134       Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr,
1135       llvm::DINodeArray());
1136   StructType *FrameTy = Shape.FrameTy;
1137   SmallVector<Metadata *, 16> Elements;
1138   DataLayout Layout = F.getDataLayout();
1139
1140   DenseMap<Value *, DILocalVariable *> DIVarCache;
1141   cacheDIVar(FrameData, DIVarCache);
1142
1143   unsigned ResumeIndex = coro::Shape::SwitchFieldIndex::Resume;
1144   unsigned DestroyIndex = coro::Shape::SwitchFieldIndex::Destroy;
1145   unsigned IndexIndex = Shape.SwitchLowering.IndexField;
1146
1147   DenseMap<unsigned, StringRef> NameCache;
1148   NameCache.insert({ResumeIndex, "__resume_fn"});
1149   NameCache.insert({DestroyIndex, "__destroy_fn"});
1150   NameCache.insert({IndexIndex, "__coro_index"});
1151
1152   Type *ResumeFnTy = FrameTy->getElementType(ResumeIndex),
1153        *DestroyFnTy = FrameTy->getElementType(DestroyIndex),
1154        *IndexTy = FrameTy->getElementType(IndexIndex);
1155
1156   DenseMap<unsigned, DIType *> TyCache;
1157   TyCache.insert(
1158       {ResumeIndex, DBuilder.createPointerType(
1159                         nullptr, Layout.getTypeSizeInBits(ResumeFnTy))});
1160   TyCache.insert(
1161       {DestroyIndex, DBuilder.createPointerType(
1162                          nullptr, Layout.getTypeSizeInBits(DestroyFnTy))});
1163
1164   /// FIXME: If we fill the field `SizeInBits` with the actual size of
1165   /// __coro_index in bits, then __coro_index wouldn't show in the debugger.
1166   TyCache.insert({IndexIndex, DBuilder.createBasicType(
1167                                   "__coro_index",
1168                                   (Layout.getTypeSizeInBits(IndexTy) < 8)
1169                                       ? 8
1170                                       : Layout.getTypeSizeInBits(IndexTy),
1171                                   dwarf::DW_ATE_unsigned_char)});
1172
1173   for (auto *V : FrameData.getAllDefs()) {
1174     if (!DIVarCache.contains(V))
1175       continue;
1176
1177     auto Index = FrameData.getFieldIndex(V);
1178
1179     NameCache.insert({Index, DIVarCache[V]->getName()});
1180     TyCache.insert({Index, DIVarCache[V]->getType()});
1181   }
1182
1183   // Cache from index to an (Align, Offset) pair.
1184   DenseMap<unsigned, std::pair<unsigned, unsigned>> OffsetCache;
1185   // The Align and Offset of the Resume and Destroy function pointers are fixed.
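  // (The constants below describe the two leading function-pointer fields and
  // hard-code 8-byte pointers: alignment 8, at offsets 0 and 8 respectively.)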
1186   OffsetCache.insert({ResumeIndex, {8, 0}});
1187   OffsetCache.insert({DestroyIndex, {8, 8}});
1188   OffsetCache.insert(
1189       {IndexIndex,
1190        {Shape.SwitchLowering.IndexAlign, Shape.SwitchLowering.IndexOffset}});
1191
1192   for (auto *V : FrameData.getAllDefs()) {
1193     auto Index = FrameData.getFieldIndex(V);
1194
1195     OffsetCache.insert(
1196         {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}});
1197   }
1198
1199   DenseMap<Type *, DIType *> DITypeCache;
1200   // This counter is used to avoid repeated type names; e.g., there may be
1201   // many i32 and i64 fields in one coroutine frame, so we name them i32_0,
1202   // i32_1 and so on, since it makes no sense for the names of different
1203   // fields to conflict with each other.
1204   unsigned UnknownTypeNum = 0;
1205   for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
1206     if (!OffsetCache.contains(Index))
1207       continue;
1208
1209     std::string Name;
1210     uint64_t SizeInBits;
1211     uint32_t AlignInBits;
1212     uint64_t OffsetInBits;
1213     DIType *DITy = nullptr;
1214
1215     Type *Ty = FrameTy->getElementType(Index);
1216     assert(Ty->isSized() && "We can't handle types that are not sized.\n");
1217     SizeInBits = Layout.getTypeSizeInBits(Ty).getFixedValue();
1218     AlignInBits = OffsetCache[Index].first * 8;
1219     OffsetInBits = OffsetCache[Index].second * 8;
1220
1221     if (NameCache.contains(Index)) {
1222       Name = NameCache[Index].str();
1223       DITy = TyCache[Index];
1224     } else {
1225       DITy = solveDIType(DBuilder, Ty, Layout, FrameDITy, LineNum, DITypeCache);
1226       assert(DITy && "SolveDIType shouldn't return nullptr.\n");
1227       Name = DITy->getName().str();
1228       Name += "_" + std::to_string(UnknownTypeNum);
1229       UnknownTypeNum++;
1230     }
1231
1232     Elements.push_back(DBuilder.createMemberType(
1233         FrameDITy, Name, DFile, LineNum, SizeInBits, AlignInBits, OffsetInBits,
1234         llvm::DINode::FlagArtificial, DITy));
1235   }
1236
1237   DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements));
1238
1239   auto *FrameDIVar =
1240       DBuilder.createAutoVariable(DIS, "__coro_frame", DFile, LineNum,
1241                                   FrameDITy, true, DINode::FlagArtificial);
1242
1243   // The subprogram has a RetainedNodes field which records the debug
1244   // variables it contains. So we need to add __coro_frame to the
1245   // RetainedNodes of it.
1246   //
1247   // If we don't add __coro_frame to the RetainedNodes, the user may get
1248   // `no symbol __coro_frame in context` rather than the more precise
1249   // `__coro_frame is optimized out`.
1250   auto RetainedNodes = DIS->getRetainedNodes();
1251   SmallVector<Metadata *, 32> RetainedNodesVec(RetainedNodes.begin(),
1252                                                RetainedNodes.end());
1253   RetainedNodesVec.push_back(FrameDIVar);
1254   DIS->replaceOperandWith(7, (MDTuple::get(F.getContext(), RetainedNodesVec)));
1255
1256   // Construct the location for the frame debug variable. The column number
1257   // is fake but it should be fine.
1258 DILocation *DILoc = 1259 DILocation::get(DIS->getContext(), LineNum, /*Column=*/1, DIS); 1260 assert(FrameDIVar->isValidLocationForIntrinsic(DILoc)); 1261 1262 if (UseNewDbgInfoFormat) { 1263 DbgVariableRecord *NewDVR = 1264 new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar, 1265 DBuilder.createExpression(), DILoc, 1266 DbgVariableRecord::LocationType::Declare); 1267 BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr(); 1268 It->getParent()->insertDbgRecordBefore(NewDVR, It); 1269 } else { 1270 DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar, 1271 DBuilder.createExpression(), DILoc, 1272 &*Shape.getInsertPtAfterFramePtr()); 1273 } 1274 } 1275 1276 // Build a struct that will keep state for an active coroutine. 1277 // struct f.frame { 1278 // ResumeFnTy ResumeFnAddr; 1279 // ResumeFnTy DestroyFnAddr; 1280 // ... promise (if present) ... 1281 // int ResumeIndex; 1282 // ... spills ... 1283 // }; 1284 static StructType *buildFrameType(Function &F, coro::Shape &Shape, 1285 FrameDataInfo &FrameData) { 1286 LLVMContext &C = F.getContext(); 1287 const DataLayout &DL = F.getDataLayout(); 1288 StructType *FrameTy = [&] { 1289 SmallString<32> Name(F.getName()); 1290 Name.append(".Frame"); 1291 return StructType::create(C, Name); 1292 }(); 1293 1294 // We will use this value to cap the alignment of spilled values. 1295 std::optional<Align> MaxFrameAlignment; 1296 if (Shape.ABI == coro::ABI::Async) 1297 MaxFrameAlignment = Shape.AsyncLowering.getContextAlignment(); 1298 FrameTypeBuilder B(C, DL, MaxFrameAlignment); 1299 1300 AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); 1301 std::optional<FieldIDType> SwitchIndexFieldId; 1302 1303 if (Shape.ABI == coro::ABI::Switch) { 1304 auto *FnPtrTy = PointerType::getUnqual(C); 1305 1306 // Add header fields for the resume and destroy functions. 1307 // We can rely on these being perfectly packed. 1308 (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true); 1309 (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true); 1310 1311 // PromiseAlloca field needs to be explicitly added here because it's 1312 // a header field with a fixed offset based on its alignment. Hence it 1313 // needs special handling and cannot be added to FrameData.Allocas. 1314 if (PromiseAlloca) 1315 FrameData.setFieldIndex( 1316 PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true)); 1317 1318 // Add a field to store the suspend index. This doesn't need to 1319 // be in the header. 1320 unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); 1321 Type *IndexType = Type::getIntNTy(C, IndexBits); 1322 1323 SwitchIndexFieldId = B.addField(IndexType, std::nullopt); 1324 } else { 1325 assert(PromiseAlloca == nullptr && "lowering doesn't support promises"); 1326 } 1327 1328 // Because multiple allocas may own the same field slot, 1329 // we add allocas to field here. 1330 B.addFieldForAllocas(F, FrameData, Shape); 1331 // Add PromiseAlloca to Allocas list so that 1332 // 1. updateLayoutIndex could update its index after 1333 // `performOptimizedStructLayout` 1334 // 2. it is processed in insertSpills. 1335 if (Shape.ABI == coro::ABI::Switch && PromiseAlloca) 1336 // We assume that the promise alloca won't be modified before 1337 // CoroBegin and no alias will be create before CoroBegin. 1338 FrameData.Allocas.emplace_back( 1339 PromiseAlloca, DenseMap<Instruction *, std::optional<APInt>>{}, false); 1340 // Create an entry for every spilled value. 
1341   for (auto &S : FrameData.Spills) {
1342     Type *FieldType = S.first->getType();
1343     // For byval arguments, we need to store the pointed-to value in the frame
1344     // instead of the pointer itself.
1345     if (const Argument *A = dyn_cast<Argument>(S.first))
1346       if (A->hasByValAttr())
1347         FieldType = A->getParamByValType();
1348     FieldIDType Id = B.addField(FieldType, std::nullopt, false /*header*/,
1349                                 true /*IsSpillOfValue*/);
1350     FrameData.setFieldIndex(S.first, Id);
1351   }
1352
1353   B.finish(FrameTy);
1354   FrameData.updateLayoutIndex(B);
1355   Shape.FrameAlign = B.getStructAlign();
1356   Shape.FrameSize = B.getStructSize();
1357
1358   switch (Shape.ABI) {
1359   case coro::ABI::Switch: {
1360     // In the switch ABI, remember the switch-index field.
1361     auto IndexField = B.getLayoutField(*SwitchIndexFieldId);
1362     Shape.SwitchLowering.IndexField = IndexField.LayoutFieldIndex;
1363     Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value();
1364     Shape.SwitchLowering.IndexOffset = IndexField.Offset;
1365
1366     // Also round the frame size up to a multiple of its alignment, as is
1367     // generally expected in C/C++.
1368     Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
1369     break;
1370   }
1371
1372   // In the retcon ABI, remember whether the frame is inline in the storage.
1373   case coro::ABI::Retcon:
1374   case coro::ABI::RetconOnce: {
1375     auto Id = Shape.getRetconCoroId();
1376     Shape.RetconLowering.IsFrameInlineInStorage
1377       = (B.getStructSize() <= Id->getStorageSize() &&
1378          B.getStructAlign() <= Id->getStorageAlignment());
1379     break;
1380   }
1381   case coro::ABI::Async: {
1382     Shape.AsyncLowering.FrameOffset =
1383         alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign);
1384     // Also make the final context size a multiple of the context alignment to
1385     // make allocation easier for allocators.
1386     Shape.AsyncLowering.ContextSize =
1387         alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize,
1388                 Shape.AsyncLowering.getContextAlignment());
1389     if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) {
1390       report_fatal_error(
1391           "The alignment requirement of frame variables cannot be higher than "
1392           "the alignment of the async function context");
1393     }
1394     break;
1395   }
1396   }
1397
1398   return FrameTy;
1399 }
1400
1401 // We use a pointer use visitor to track how an alloca is being used.
1402 // The goal is to be able to answer the following three questions:
1403 // 1. Should this alloca be allocated on the frame instead?
1404 // 2. Could the content of the alloca be modified prior to CoroBegin, which
1405 //    would require copying the data from the alloca to the frame after CoroBegin?
1406 // 3. Is there any alias created for this alloca prior to CoroBegin but used
1407 //    after CoroBegin? In that case, we will need to recreate the alias after
1408 //    CoroBegin based off the frame. To answer question 1, we track two things:
1409 // a. The list of all BasicBlocks that use this alloca or any of its aliases.
1410 //    In the end, we check if there exist any two such basic blocks that
1411 //    cross suspension points. If so, this alloca must be put on the frame.
1412 // b. Whether the alloca or any alias of the alloca escapes at some point,
1413 //    either by storing the address somewhere, or by passing the address to a
1414 //    function call that might capture it. If it ever escapes, this alloca must
1415 //    be put on the frame conservatively.
1416 // To answer question 2, we track the variable MayWriteBeforeCoroBegin.
1417 // Whenever a potential write happens, either through a store instruction, a 1418 // function call or any of the memory intrinsics, we check whether this 1419 // instruction is prior to CoroBegin. To answer question 3, we track the offsets 1420 // of all aliases created for the alloca prior to CoroBegin but used after 1421 // CoroBegin. std::optional is used to be able to represent the case when the 1422 // offset is unknown (e.g. when you have a PHINode that takes in different 1423 // offset values). We cannot handle unknown offsets and will assert. This is the 1424 // potential issue left out. An ideal solution would likely require a 1425 // significant redesign. 1426 namespace { 1427 struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { 1428 using Base = PtrUseVisitor<AllocaUseVisitor>; 1429 AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, 1430 const coro::Shape &CoroShape, 1431 const SuspendCrossingInfo &Checker, 1432 bool ShouldUseLifetimeStartInfo) 1433 : PtrUseVisitor(DL), DT(DT), CoroShape(CoroShape), Checker(Checker), 1434 ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) { 1435 for (AnyCoroSuspendInst *SuspendInst : CoroShape.CoroSuspends) 1436 CoroSuspendBBs.insert(SuspendInst->getParent()); 1437 } 1438 1439 void visit(Instruction &I) { 1440 Users.insert(&I); 1441 Base::visit(I); 1442 // If the pointer is escaped prior to CoroBegin, we have to assume it would 1443 // be written into before CoroBegin as well. 1444 if (PI.isEscaped() && 1445 !DT.dominates(CoroShape.CoroBegin, PI.getEscapingInst())) { 1446 MayWriteBeforeCoroBegin = true; 1447 } 1448 } 1449 // We need to provide this overload as PtrUseVisitor uses a pointer based 1450 // visiting function. 1451 void visit(Instruction *I) { return visit(*I); } 1452 1453 void visitPHINode(PHINode &I) { 1454 enqueueUsers(I); 1455 handleAlias(I); 1456 } 1457 1458 void visitSelectInst(SelectInst &I) { 1459 enqueueUsers(I); 1460 handleAlias(I); 1461 } 1462 1463 void visitStoreInst(StoreInst &SI) { 1464 // Regardless whether the alias of the alloca is the value operand or the 1465 // pointer operand, we need to assume the alloca is been written. 1466 handleMayWrite(SI); 1467 1468 if (SI.getValueOperand() != U->get()) 1469 return; 1470 1471 // We are storing the pointer into a memory location, potentially escaping. 1472 // As an optimization, we try to detect simple cases where it doesn't 1473 // actually escape, for example: 1474 // %ptr = alloca .. 1475 // %addr = alloca .. 1476 // store %ptr, %addr 1477 // %x = load %addr 1478 // .. 1479 // If %addr is only used by loading from it, we could simply treat %x as 1480 // another alias of %ptr, and not considering %ptr being escaped. 1481 auto IsSimpleStoreThenLoad = [&]() { 1482 auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand()); 1483 // If the memory location we are storing to is not an alloca, it 1484 // could be an alias of some other memory locations, which is difficult 1485 // to analyze. 1486 if (!AI) 1487 return false; 1488 // StoreAliases contains aliases of the memory location stored into. 1489 SmallVector<Instruction *, 4> StoreAliases = {AI}; 1490 while (!StoreAliases.empty()) { 1491 Instruction *I = StoreAliases.pop_back_val(); 1492 for (User *U : I->users()) { 1493 // If we are loading from the memory location, we are creating an 1494 // alias of the original pointer. 
1495 if (auto *LI = dyn_cast<LoadInst>(U)) { 1496 enqueueUsers(*LI); 1497 handleAlias(*LI); 1498 continue; 1499 } 1500 // If we are overriding the memory location, the pointer certainly 1501 // won't escape. 1502 if (auto *S = dyn_cast<StoreInst>(U)) 1503 if (S->getPointerOperand() == I) 1504 continue; 1505 if (auto *II = dyn_cast<IntrinsicInst>(U)) 1506 if (II->isLifetimeStartOrEnd()) 1507 continue; 1508 // BitCastInst creats aliases of the memory location being stored 1509 // into. 1510 if (auto *BI = dyn_cast<BitCastInst>(U)) { 1511 StoreAliases.push_back(BI); 1512 continue; 1513 } 1514 return false; 1515 } 1516 } 1517 1518 return true; 1519 }; 1520 1521 if (!IsSimpleStoreThenLoad()) 1522 PI.setEscaped(&SI); 1523 } 1524 1525 // All mem intrinsics modify the data. 1526 void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); } 1527 1528 void visitBitCastInst(BitCastInst &BC) { 1529 Base::visitBitCastInst(BC); 1530 handleAlias(BC); 1531 } 1532 1533 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { 1534 Base::visitAddrSpaceCastInst(ASC); 1535 handleAlias(ASC); 1536 } 1537 1538 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 1539 // The base visitor will adjust Offset accordingly. 1540 Base::visitGetElementPtrInst(GEPI); 1541 handleAlias(GEPI); 1542 } 1543 1544 void visitIntrinsicInst(IntrinsicInst &II) { 1545 // When we found the lifetime markers refers to a 1546 // subrange of the original alloca, ignore the lifetime 1547 // markers to avoid misleading the analysis. 1548 if (!IsOffsetKnown || !Offset.isZero()) 1549 return Base::visitIntrinsicInst(II); 1550 switch (II.getIntrinsicID()) { 1551 default: 1552 return Base::visitIntrinsicInst(II); 1553 case Intrinsic::lifetime_start: 1554 LifetimeStarts.insert(&II); 1555 LifetimeStartBBs.push_back(II.getParent()); 1556 break; 1557 case Intrinsic::lifetime_end: 1558 LifetimeEndBBs.insert(II.getParent()); 1559 break; 1560 } 1561 } 1562 1563 void visitCallBase(CallBase &CB) { 1564 for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op) 1565 if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op)) 1566 PI.setEscaped(&CB); 1567 handleMayWrite(CB); 1568 } 1569 1570 bool getShouldLiveOnFrame() const { 1571 if (!ShouldLiveOnFrame) 1572 ShouldLiveOnFrame = computeShouldLiveOnFrame(); 1573 return *ShouldLiveOnFrame; 1574 } 1575 1576 bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; } 1577 1578 DenseMap<Instruction *, std::optional<APInt>> getAliasesCopy() const { 1579 assert(getShouldLiveOnFrame() && "This method should only be called if the " 1580 "alloca needs to live on the frame."); 1581 for (const auto &P : AliasOffetMap) 1582 if (!P.second) 1583 report_fatal_error("Unable to handle an alias with unknown offset " 1584 "created before CoroBegin."); 1585 return AliasOffetMap; 1586 } 1587 1588 private: 1589 const DominatorTree &DT; 1590 const coro::Shape &CoroShape; 1591 const SuspendCrossingInfo &Checker; 1592 // All alias to the original AllocaInst, created before CoroBegin and used 1593 // after CoroBegin. Each entry contains the instruction and the offset in the 1594 // original Alloca. They need to be recreated after CoroBegin off the frame. 
1595 DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{}; 1596 SmallPtrSet<Instruction *, 4> Users{}; 1597 SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{}; 1598 SmallVector<BasicBlock *> LifetimeStartBBs{}; 1599 SmallPtrSet<BasicBlock *, 2> LifetimeEndBBs{}; 1600 SmallPtrSet<const BasicBlock *, 2> CoroSuspendBBs{}; 1601 bool MayWriteBeforeCoroBegin{false}; 1602 bool ShouldUseLifetimeStartInfo{true}; 1603 1604 mutable std::optional<bool> ShouldLiveOnFrame{}; 1605 1606 bool computeShouldLiveOnFrame() const { 1607 // If lifetime information is available, we check it first since it's 1608 // more precise. We look at every pair of lifetime.start intrinsic and 1609 // every basic block that uses the pointer to see if they cross suspension 1610 // points. The uses cover both direct uses as well as indirect uses. 1611 if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) { 1612 // If there is no explicit lifetime.end, then assume the address can 1613 // cross suspension points. 1614 if (LifetimeEndBBs.empty()) 1615 return true; 1616 1617 // If there is a path from a lifetime.start to a suspend without a 1618 // corresponding lifetime.end, then the alloca's lifetime persists 1619 // beyond that suspension point and the alloca must go on the frame. 1620 llvm::SmallVector<BasicBlock *> Worklist(LifetimeStartBBs); 1621 if (isManyPotentiallyReachableFromMany(Worklist, CoroSuspendBBs, 1622 &LifetimeEndBBs, &DT)) 1623 return true; 1624 1625 // Addresses are guaranteed to be identical after every lifetime.start so 1626 // we cannot use the local stack if the address escaped and there is a 1627 // suspend point between lifetime markers. This should also cover the 1628 // case of a single lifetime.start intrinsic in a loop with suspend point. 1629 if (PI.isEscaped()) { 1630 for (auto *A : LifetimeStarts) { 1631 for (auto *B : LifetimeStarts) { 1632 if (Checker.hasPathOrLoopCrossingSuspendPoint(A->getParent(), 1633 B->getParent())) 1634 return true; 1635 } 1636 } 1637 } 1638 return false; 1639 } 1640 // FIXME: Ideally the isEscaped check should come at the beginning. 1641 // However there are a few loose ends that need to be fixed first before 1642 // we can do that. We need to make sure we are not over-conservative, so 1643 // that the data accessed in-between await_suspend and symmetric transfer 1644 // is always put on the stack, and also data accessed after coro.end is 1645 // always put on the stack (esp the return object). To fix that, we need 1646 // to: 1647 // 1) Potentially treat sret as nocapture in calls 1648 // 2) Special handle the return object and put it on the stack 1649 // 3) Utilize lifetime.end intrinsic 1650 if (PI.isEscaped()) 1651 return true; 1652 1653 for (auto *U1 : Users) 1654 for (auto *U2 : Users) 1655 if (Checker.isDefinitionAcrossSuspend(*U1, U2)) 1656 return true; 1657 1658 return false; 1659 } 1660 1661 void handleMayWrite(const Instruction &I) { 1662 if (!DT.dominates(CoroShape.CoroBegin, &I)) 1663 MayWriteBeforeCoroBegin = true; 1664 } 1665 1666 bool usedAfterCoroBegin(Instruction &I) { 1667 for (auto &U : I.uses()) 1668 if (DT.dominates(CoroShape.CoroBegin, U)) 1669 return true; 1670 return false; 1671 } 1672 1673 void handleAlias(Instruction &I) { 1674 // We track all aliases created prior to CoroBegin but used after. 1675 // These aliases may need to be recreated after CoroBegin if the alloca 1676 // need to live on the frame. 
1677 if (DT.dominates(CoroShape.CoroBegin, &I) || !usedAfterCoroBegin(I)) 1678 return; 1679 1680 if (!IsOffsetKnown) { 1681 AliasOffetMap[&I].reset(); 1682 } else { 1683 auto Itr = AliasOffetMap.find(&I); 1684 if (Itr == AliasOffetMap.end()) { 1685 AliasOffetMap[&I] = Offset; 1686 } else if (Itr->second && *Itr->second != Offset) { 1687 // If we have seen two different possible values for this alias, we set 1688 // it to empty. 1689 AliasOffetMap[&I].reset(); 1690 } 1691 } 1692 } 1693 }; 1694 } // namespace 1695 1696 // We need to make room to insert a spill after initial PHIs, but before 1697 // catchswitch instruction. Placing it before violates the requirement that 1698 // catchswitch, like all other EHPads must be the first nonPHI in a block. 1699 // 1700 // Split away catchswitch into a separate block and insert in its place: 1701 // 1702 // cleanuppad <InsertPt> cleanupret. 1703 // 1704 // cleanupret instruction will act as an insert point for the spill. 1705 static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { 1706 BasicBlock *CurrentBlock = CatchSwitch->getParent(); 1707 BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch); 1708 CurrentBlock->getTerminator()->eraseFromParent(); 1709 1710 auto *CleanupPad = 1711 CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock); 1712 auto *CleanupRet = 1713 CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock); 1714 return CleanupRet; 1715 } 1716 1717 // Replace all alloca and SSA values that are accessed across suspend points 1718 // with GetElementPointer from coroutine frame + loads and stores. Create an 1719 // AllocaSpillBB that will become the new entry block for the resume parts of 1720 // the coroutine: 1721 // 1722 // %hdl = coro.begin(...) 1723 // whatever 1724 // 1725 // becomes: 1726 // 1727 // %hdl = coro.begin(...) 1728 // br label %AllocaSpillBB 1729 // 1730 // AllocaSpillBB: 1731 // ; geps corresponding to allocas that were moved to coroutine frame 1732 // br label PostSpill 1733 // 1734 // PostSpill: 1735 // whatever 1736 // 1737 // 1738 static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { 1739 auto *CB = Shape.CoroBegin; 1740 LLVMContext &C = CB->getContext(); 1741 Function *F = CB->getFunction(); 1742 IRBuilder<> Builder(C); 1743 StructType *FrameTy = Shape.FrameTy; 1744 Value *FramePtr = Shape.FramePtr; 1745 DominatorTree DT(*F); 1746 SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap; 1747 1748 // Create a GEP with the given index into the coroutine frame for the original 1749 // value Orig. Appends an extra 0 index for array-allocas, preserving the 1750 // original type. 
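  // As an illustrative sketch (the value name and field index below are
  // hypothetical), the resulting IR looks roughly like:
  //   %x.addr = getelementptr inbounds %f.Frame, ptr %FramePtr, i32 0, i32 2
  // and for an array alloca the extra trailing 0 index keeps the GEP pointing
  // at the first element of the original alloca type.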
1751 auto GetFramePointer = [&](Value *Orig) -> Value * { 1752 FieldIDType Index = FrameData.getFieldIndex(Orig); 1753 SmallVector<Value *, 3> Indices = { 1754 ConstantInt::get(Type::getInt32Ty(C), 0), 1755 ConstantInt::get(Type::getInt32Ty(C), Index), 1756 }; 1757 1758 if (auto *AI = dyn_cast<AllocaInst>(Orig)) { 1759 if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) { 1760 auto Count = CI->getValue().getZExtValue(); 1761 if (Count > 1) { 1762 Indices.push_back(ConstantInt::get(Type::getInt32Ty(C), 0)); 1763 } 1764 } else { 1765 report_fatal_error("Coroutines cannot handle non static allocas yet"); 1766 } 1767 } 1768 1769 auto GEP = cast<GetElementPtrInst>( 1770 Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices)); 1771 if (auto *AI = dyn_cast<AllocaInst>(Orig)) { 1772 if (FrameData.getDynamicAlign(Orig) != 0) { 1773 assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value()); 1774 auto *M = AI->getModule(); 1775 auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType()); 1776 auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy); 1777 auto *AlignMask = 1778 ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1); 1779 PtrValue = Builder.CreateAdd(PtrValue, AlignMask); 1780 PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask)); 1781 return Builder.CreateIntToPtr(PtrValue, AI->getType()); 1782 } 1783 // If the type of GEP is not equal to the type of AllocaInst, it implies 1784 // that the AllocaInst may be reused in the Frame slot of other 1785 // AllocaInst. So We cast GEP to the AllocaInst here to re-use 1786 // the Frame storage. 1787 // 1788 // Note: If we change the strategy dealing with alignment, we need to refine 1789 // this casting. 1790 if (GEP->getType() != Orig->getType()) 1791 return Builder.CreateAddrSpaceCast(GEP, Orig->getType(), 1792 Orig->getName() + Twine(".cast")); 1793 } 1794 return GEP; 1795 }; 1796 1797 for (auto const &E : FrameData.Spills) { 1798 Value *Def = E.first; 1799 auto SpillAlignment = Align(FrameData.getAlign(Def)); 1800 // Create a store instruction storing the value into the 1801 // coroutine frame. 1802 BasicBlock::iterator InsertPt; 1803 Type *ByValTy = nullptr; 1804 if (auto *Arg = dyn_cast<Argument>(Def)) { 1805 // For arguments, we will place the store instruction right after 1806 // the coroutine frame pointer instruction, i.e. coro.begin. 1807 InsertPt = Shape.getInsertPtAfterFramePtr(); 1808 1809 // If we're spilling an Argument, make sure we clear 'nocapture' 1810 // from the coroutine function. 1811 Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture); 1812 1813 if (Arg->hasByValAttr()) 1814 ByValTy = Arg->getParamByValType(); 1815 } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) { 1816 // Don't spill immediately after a suspend; splitting assumes 1817 // that the suspend will be followed by a branch. 1818 InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHIIt(); 1819 } else { 1820 auto *I = cast<Instruction>(Def); 1821 if (!DT.dominates(CB, I)) { 1822 // If it is not dominated by CoroBegin, then spill should be 1823 // inserted immediately after CoroFrame is computed. 1824 InsertPt = Shape.getInsertPtAfterFramePtr(); 1825 } else if (auto *II = dyn_cast<InvokeInst>(I)) { 1826 // If we are spilling the result of the invoke instruction, split 1827 // the normal edge and insert the spill in the new block. 
1828 auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest()); 1829 InsertPt = NewBB->getTerminator()->getIterator(); 1830 } else if (isa<PHINode>(I)) { 1831 // Skip the PHINodes and EH pads instructions. 1832 BasicBlock *DefBlock = I->getParent(); 1833 if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) 1834 InsertPt = splitBeforeCatchSwitch(CSI)->getIterator(); 1835 else 1836 InsertPt = DefBlock->getFirstInsertionPt(); 1837 } else { 1838 assert(!I->isTerminator() && "unexpected terminator"); 1839 // For all other values, the spill is placed immediately after 1840 // the definition. 1841 InsertPt = I->getNextNode()->getIterator(); 1842 } 1843 } 1844 1845 auto Index = FrameData.getFieldIndex(Def); 1846 Builder.SetInsertPoint(InsertPt->getParent(), InsertPt); 1847 auto *G = Builder.CreateConstInBoundsGEP2_32( 1848 FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr")); 1849 if (ByValTy) { 1850 // For byval arguments, we need to store the pointed value in the frame, 1851 // instead of the pointer itself. 1852 auto *Value = Builder.CreateLoad(ByValTy, Def); 1853 Builder.CreateAlignedStore(Value, G, SpillAlignment); 1854 } else { 1855 Builder.CreateAlignedStore(Def, G, SpillAlignment); 1856 } 1857 1858 BasicBlock *CurrentBlock = nullptr; 1859 Value *CurrentReload = nullptr; 1860 for (auto *U : E.second) { 1861 // If we have not seen the use block, create a load instruction to reload 1862 // the spilled value from the coroutine frame. Populates the Value pointer 1863 // reference provided with the frame GEP. 1864 if (CurrentBlock != U->getParent()) { 1865 CurrentBlock = U->getParent(); 1866 Builder.SetInsertPoint(CurrentBlock, 1867 CurrentBlock->getFirstInsertionPt()); 1868 1869 auto *GEP = GetFramePointer(E.first); 1870 GEP->setName(E.first->getName() + Twine(".reload.addr")); 1871 if (ByValTy) 1872 CurrentReload = GEP; 1873 else 1874 CurrentReload = Builder.CreateAlignedLoad( 1875 FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP, 1876 SpillAlignment, E.first->getName() + Twine(".reload")); 1877 1878 TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def); 1879 TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(Def); 1880 // Try best to find dbg.declare. If the spill is a temp, there may not 1881 // be a direct dbg.declare. Walk up the load chain to find one from an 1882 // alias. 1883 if (F->getSubprogram()) { 1884 auto *CurDef = Def; 1885 while (DIs.empty() && DVRs.empty() && isa<LoadInst>(CurDef)) { 1886 auto *LdInst = cast<LoadInst>(CurDef); 1887 // Only consider ptr to ptr same type load. 1888 if (LdInst->getPointerOperandType() != LdInst->getType()) 1889 break; 1890 CurDef = LdInst->getPointerOperand(); 1891 if (!isa<AllocaInst, LoadInst>(CurDef)) 1892 break; 1893 DIs = findDbgDeclares(CurDef); 1894 DVRs = findDVRDeclares(CurDef); 1895 } 1896 } 1897 1898 auto SalvageOne = [&](auto *DDI) { 1899 bool AllowUnresolved = false; 1900 // This dbg.declare is preserved for all coro-split function 1901 // fragments. It will be unreachable in the main function, and 1902 // processed by coro::salvageDebugInfo() by CoroCloner. 
          if (UseNewDbgInfoFormat) {
            DbgVariableRecord *NewDVR = new DbgVariableRecord(
                ValueAsMetadata::get(CurrentReload), DDI->getVariable(),
                DDI->getExpression(), DDI->getDebugLoc(),
                DbgVariableRecord::LocationType::Declare);
            Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore(
                NewDVR, Builder.GetInsertPoint());
          } else {
            DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
                .insertDeclare(CurrentReload, DDI->getVariable(),
                               DDI->getExpression(), DDI->getDebugLoc(),
                               &*Builder.GetInsertPoint());
          }
          // This dbg.declare is for the main function entry point. It
          // will be deleted in all coro-split functions.
          coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame,
                                 false /*UseEntryValue*/);
        };
        for_each(DIs, SalvageOne);
        for_each(DVRs, SalvageOne);
      }

      // If we have a single-edge PHINode, remove it and replace it with a
      // reload from the coroutine frame. (We already took care of multi-edge
      // PHINodes by rewriting them in the rewritePHIs function.)
      if (auto *PN = dyn_cast<PHINode>(U)) {
        assert(PN->getNumIncomingValues() == 1 &&
               "unexpected number of incoming "
               "values in the PHINode");
        PN->replaceAllUsesWith(CurrentReload);
        PN->eraseFromParent();
        continue;
      }

      // Replace all uses of CurrentValue in the current instruction with
      // the reload.
      U->replaceUsesOfWith(Def, CurrentReload);
      // Instructions are added to Def's user list if the attached
      // debug records use Def. Update those now.
      for (DbgVariableRecord &DVR : filterDbgVars(U->getDbgRecordRange()))
        DVR.replaceVariableLocationOp(Def, CurrentReload, true);
    }
  }

  BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent();

  auto SpillBlock = FramePtrBB->splitBasicBlock(
      Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB");
  SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill");
  Shape.AllocaSpillBlock = SpillBlock;

  // retcon and retcon.once lowering assumes all uses have been sunk.
  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
      Shape.ABI == coro::ABI::Async) {
    // If we found any allocas, replace all of their remaining uses with GEPs.
    Builder.SetInsertPoint(SpillBlock, SpillBlock->begin());
    for (const auto &P : FrameData.Allocas) {
      AllocaInst *Alloca = P.Alloca;
      auto *G = GetFramePointer(Alloca);

      // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G))
      // here, as we are changing the location of the instruction.
      G->takeName(Alloca);
      Alloca->replaceAllUsesWith(G);
      Alloca->eraseFromParent();
    }
    return;
  }

  // If we found any allocas, replace all of their remaining uses with GEP
  // instructions. To retain debuggability, we replace the uses of allocas for
  // dbg.declares and dbg.values with the reload from the frame.
  // Note: We cannot replace the alloca with GEP instructions indiscriminately,
  // as some of the uses may not be dominated by CoroBegin.
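  // As a hypothetical sketch, a use of an alloca that is dominated by
  // coro.begin, such as:
  //   %x = alloca i64
  //   store i64 1, ptr %x
  // is rewritten to go through the frame instead:
  //   %x.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 2
  //   store i64 1, ptr %x.reload.addr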
  Builder.SetInsertPoint(Shape.AllocaSpillBlock,
                         Shape.AllocaSpillBlock->begin());
  SmallVector<Instruction *, 4> UsersToUpdate;
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    UsersToUpdate.clear();
    for (User *U : Alloca->users()) {
      auto *I = cast<Instruction>(U);
      if (DT.dominates(CB, I))
        UsersToUpdate.push_back(I);
    }
    if (UsersToUpdate.empty())
      continue;
    auto *G = GetFramePointer(Alloca);
    G->setName(Alloca->getName() + Twine(".reload.addr"));

    SmallVector<DbgVariableIntrinsic *, 4> DIs;
    SmallVector<DbgVariableRecord *> DbgVariableRecords;
    findDbgUsers(DIs, Alloca, &DbgVariableRecords);
    for (auto *DVI : DIs)
      DVI->replaceUsesOfWith(Alloca, G);
    for (auto *DVR : DbgVariableRecords)
      DVR->replaceVariableLocationOp(Alloca, G);

    for (Instruction *I : UsersToUpdate) {
      // It is meaningless to retain lifetime intrinsics that refer to a
      // member of the coroutine frame, and such meaningless lifetime
      // intrinsics may block further optimizations.
      if (I->isLifetimeStartOrEnd()) {
        I->eraseFromParent();
        continue;
      }

      I->replaceUsesOfWith(Alloca, G);
    }
  }
  Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    if (A.MayWriteBeforeCoroBegin) {
      // isEscaped really means potentially modified before CoroBegin.
      if (Alloca->isArrayAllocation())
        report_fatal_error(
            "Coroutines cannot handle copying of array allocas yet");

      auto *G = GetFramePointer(Alloca);
      auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca);
      Builder.CreateStore(Value, G);
    }
    // For each alias to Alloca created before CoroBegin but used after
    // CoroBegin, we recreate it after CoroBegin by applying the offset
    // to the pointer in the frame.
    for (const auto &Alias : A.Aliases) {
      auto *FramePtr = GetFramePointer(Alloca);
      auto &Value = *Alias.second;
      auto ITy = IntegerType::get(C, Value.getBitWidth());
      auto *AliasPtr =
          Builder.CreatePtrAdd(FramePtr, ConstantInt::get(ITy, Value));
      Alias.first->replaceUsesWithIf(
          AliasPtr, [&](Use &U) { return DT.dominates(CB, U); });
    }
  }

  // PromiseAlloca is not collected in FrameData.Allocas, so the code above
  // does not handle the case where the PromiseAlloca has writes before
  // CoroBegin, which may be problematic in edge cases. See
  // https://github.com/llvm/llvm-project/issues/57861 for an example.
  if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) {
    AllocaInst *PA = Shape.SwitchLowering.PromiseAlloca;
    // Check whether there is a memory access to the promise alloca before
    // CoroBegin.
    bool HasAccessingPromiseBeforeCB = llvm::any_of(PA->uses(), [&](Use &U) {
      auto *Inst = dyn_cast<Instruction>(U.getUser());
      if (!Inst || DT.dominates(CB, Inst))
        return false;

      if (auto *CI = dyn_cast<CallInst>(Inst)) {
        // It is fine if the call wouldn't write to the Promise.
        // This is possible for @llvm.coro.id intrinsics, which
        // would take the promise as the second argument as a
        // marker.
        if (CI->onlyReadsMemory() ||
            CI->onlyReadsMemory(CI->getArgOperandNo(&U)))
          return false;
        return true;
      }

      return isa<StoreInst>(Inst) ||
             // It may take too much time to track the uses.
             // Be conservative in case the use may escape.
             isa<GetElementPtrInst>(Inst) ||
             // There would always be a bitcast for the promise alloca
             // before opaque pointers were enabled. Now that opaque
             // pointers are enabled by default, this should be fine.
             isa<BitCastInst>(Inst);
    });
    if (HasAccessingPromiseBeforeCB) {
      Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
      auto *G = GetFramePointer(PA);
      auto *Value = Builder.CreateLoad(PA->getAllocatedType(), PA);
      Builder.CreateStore(Value, G);
    }
  }
}

// Moves the values in the PHIs in SuccBB that correspond to PredBB into a new
// PHI in InsertedBB.
static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB,
                                         BasicBlock *InsertedBB,
                                         BasicBlock *PredBB,
                                         PHINode *UntilPHI = nullptr) {
  auto *PN = cast<PHINode>(&SuccBB->front());
  do {
    int Index = PN->getBasicBlockIndex(InsertedBB);
    Value *V = PN->getIncomingValue(Index);
    PHINode *InputV = PHINode::Create(
        V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName());
    InputV->insertBefore(InsertedBB->begin());
    InputV->addIncoming(V, PredBB);
    PN->setIncomingValue(Index, InputV);
    PN = dyn_cast<PHINode>(PN->getNextNode());
  } while (PN != UntilPHI);
}

// Rewrites the PHI Nodes in a cleanuppad.
static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB,
                                     CleanupPadInst *CleanupPad) {
  // For every incoming edge to a CleanupPad we will create a new block holding
  // all incoming values in single-value PHI nodes. We will then create another
  // block to act as a dispatcher (as all unwind edges for related EH blocks
  // must be the same).
  //
  // cleanuppad:
  //    %2 = phi i32 [%0, %catchswitch], [%1, %catch.1]
  //    %3 = cleanuppad within none []
  //
  // It will create:
  //
  // cleanuppad.corodispatch
  //    %2 = phi i8 [0, %catchswitch], [1, %catch.1]
  //    %3 = cleanuppad within none []
  //    switch i8 %2, label %unreachable
  //            [i8 0, label %cleanuppad.from.catchswitch
  //             i8 1, label %cleanuppad.from.catch.1]
  // cleanuppad.from.catchswitch:
  //    %4 = phi i32 [%0, %catchswitch]
  //    br label %cleanuppad
  // cleanuppad.from.catch.1:
  //    %6 = phi i32 [%1, %catch.1]
  //    br label %cleanuppad
  // cleanuppad:
  //    %8 = phi i32 [%4, %cleanuppad.from.catchswitch],
  //                 [%6, %cleanuppad.from.catch.1]

  // Unreachable BB, in case switching on an invalid value in the dispatcher.
  auto *UnreachBB = BasicBlock::Create(
      CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent());
  IRBuilder<> Builder(UnreachBB);
  Builder.CreateUnreachable();

  // Create a new cleanuppad which will be the dispatcher.
  auto *NewCleanupPadBB =
      BasicBlock::Create(CleanupPadBB->getContext(),
                         CleanupPadBB->getName() + Twine(".corodispatch"),
                         CleanupPadBB->getParent(), CleanupPadBB);
  Builder.SetInsertPoint(NewCleanupPadBB);
  auto *SwitchType = Builder.getInt8Ty();
  auto *SetDispatchValuePN =
      Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB));
  CleanupPad->removeFromParent();
  CleanupPad->insertAfter(SetDispatchValuePN);
  auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB,
                                                pred_size(CleanupPadBB));

  int SwitchIndex = 0;
  SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB));
  for (BasicBlock *Pred : Preds) {
    // Create a new cleanuppad and move the PHI values there.
    auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(),
                                      CleanupPadBB->getName() +
                                          Twine(".from.") + Pred->getName(),
                                      CleanupPadBB->getParent(), CleanupPadBB);
    updatePhiNodes(CleanupPadBB, Pred, CaseBB);
    CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") +
                    Pred->getName());
    Builder.SetInsertPoint(CaseBB);
    Builder.CreateBr(CleanupPadBB);
    movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB);

    // Update this Pred to the new unwind point.
    setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB);

    // Set up the switch in the dispatcher.
    auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex);
    SetDispatchValuePN->addIncoming(SwitchConstant, Pred);
    SwitchOnDispatch->addCase(SwitchConstant, CaseBB);
    SwitchIndex++;
  }
}

static void cleanupSinglePredPHIs(Function &F) {
  SmallVector<PHINode *, 32> Worklist;
  for (auto &BB : F) {
    for (auto &Phi : BB.phis()) {
      if (Phi.getNumIncomingValues() == 1) {
        Worklist.push_back(&Phi);
      } else
        break;
    }
  }
  while (!Worklist.empty()) {
    auto *Phi = Worklist.pop_back_val();
    auto *OriginalValue = Phi->getIncomingValue(0);
    Phi->replaceAllUsesWith(OriginalValue);
  }
}

static void rewritePHIs(BasicBlock &BB) {
  // For every incoming edge we will create a block holding all
  // incoming values in single-value PHI nodes.
  //
  // loop:
  //    %n.val = phi i32 [%n, %entry], [%inc, %loop]
  //
  // It will create:
  //
  // loop.from.entry:
  //    %n.loop.pre = phi i32 [%n, %entry]
  //    br label %loop
  // loop.from.loop:
  //    %inc.loop.pre = phi i32 [%inc, %loop]
  //    br label %loop
  //
  // After this rewrite, further analysis will ignore any phi nodes with more
  // than one incoming edge.

  // TODO: Simplify PHINodes in the basic block to remove duplicate
  // predecessors.

  // Special case for CleanupPad: all EH blocks must have the same unwind edge
  // so we need to create an additional "dispatcher" block.
  if (auto *CleanupPad =
          dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) {
    SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
    for (BasicBlock *Pred : Preds) {
      if (CatchSwitchInst *CS =
              dyn_cast<CatchSwitchInst>(Pred->getTerminator())) {
        // CleanupPad with a CatchSwitch predecessor: therefore this is an
        // unwind destination that needs to be handled specially.
        assert(CS->getUnwindDest() == &BB);
        (void)CS;
        rewritePHIsForCleanupPad(&BB, CleanupPad);
        return;
      }
    }
  }

  LandingPadInst *LandingPad = nullptr;
  PHINode *ReplPHI = nullptr;
  if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
    // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
    // We replace the original landing pad with a PHINode that will collect the
    // results from all of them.
    ReplPHI = PHINode::Create(LandingPad->getType(), 1, "");
    ReplPHI->insertBefore(LandingPad->getIterator());
    ReplPHI->takeName(LandingPad);
    LandingPad->replaceAllUsesWith(ReplPHI);
    // We will erase the original landing pad at the end of this function after
    // ehAwareSplitEdge cloned it in the transition blocks.
  }

  SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
  for (BasicBlock *Pred : Preds) {
    auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
    IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());

    // Stop the moving of values at ReplPHI, as this is either null or the PHI
    // that replaced the landing pad.
    movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI);
  }

  if (LandingPad) {
    // Calls to the ehAwareSplitEdge function cloned the original landing pad.
    // We no longer need it.
    LandingPad->eraseFromParent();
  }
}

static void rewritePHIs(Function &F) {
  SmallVector<BasicBlock *, 8> WorkList;

  for (BasicBlock &BB : F)
    if (auto *PN = dyn_cast<PHINode>(&BB.front()))
      if (PN->getNumIncomingValues() > 1)
        WorkList.push_back(&BB);

  for (BasicBlock *BB : WorkList)
    rewritePHIs(*BB);
}

/// Default materializable callback.
// Check for instructions that we can recreate on resume as opposed to
// spilling the result into a coroutine frame.
bool coro::defaultMaterializable(Instruction &V) {
  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
}

// Check for structural coroutine intrinsics that should not be spilled into
// the coroutine frame.
static bool isCoroutineStructureIntrinsic(Instruction &I) {
  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
         isa<CoroSuspendInst>(&I);
}

// For each instruction identified as materializable across the suspend point,
// and its associated DAG of other rematerializable instructions,
// recreate the DAG of instructions after the suspend point.
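//
// As an illustrative sketch (values and names are hypothetical), given
//
//   %inc = add i32 %n, 1          ; materializable (a binary operator)
//   ... suspend point ...
//   call void @use(i32 %inc)
//
// the add is cloned right after the suspend point and the use is rewritten to
// refer to the clone, instead of spilling %inc to the coroutine frame.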
2295 static void rewriteMaterializableInstructions( 2296 const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> 2297 &AllRemats) { 2298 // This has to be done in 2 phases 2299 // Do the remats and record the required defs to be replaced in the 2300 // original use instructions 2301 // Once all the remats are complete, replace the uses in the final 2302 // instructions with the new defs 2303 typedef struct { 2304 Instruction *Use; 2305 Instruction *Def; 2306 Instruction *Remat; 2307 } ProcessNode; 2308 2309 SmallVector<ProcessNode> FinalInstructionsToProcess; 2310 2311 for (const auto &E : AllRemats) { 2312 Instruction *Use = E.first; 2313 Instruction *CurrentMaterialization = nullptr; 2314 RematGraph *RG = E.second.get(); 2315 ReversePostOrderTraversal<RematGraph *> RPOT(RG); 2316 SmallVector<Instruction *> InstructionsToProcess; 2317 2318 // If the target use is actually a suspend instruction then we have to 2319 // insert the remats into the end of the predecessor (there should only be 2320 // one). This is so that suspend blocks always have the suspend instruction 2321 // as the first instruction. 2322 auto InsertPoint = &*Use->getParent()->getFirstInsertionPt(); 2323 if (isa<AnyCoroSuspendInst>(Use)) { 2324 BasicBlock *SuspendPredecessorBlock = 2325 Use->getParent()->getSinglePredecessor(); 2326 assert(SuspendPredecessorBlock && "malformed coro suspend instruction"); 2327 InsertPoint = SuspendPredecessorBlock->getTerminator(); 2328 } 2329 2330 // Note: skip the first instruction as this is the actual use that we're 2331 // rematerializing everything for. 2332 auto I = RPOT.begin(); 2333 ++I; 2334 for (; I != RPOT.end(); ++I) { 2335 Instruction *D = (*I)->Node; 2336 CurrentMaterialization = D->clone(); 2337 CurrentMaterialization->setName(D->getName()); 2338 CurrentMaterialization->insertBefore(InsertPoint); 2339 InsertPoint = CurrentMaterialization; 2340 2341 // Replace all uses of Def in the instructions being added as part of this 2342 // rematerialization group 2343 for (auto &I : InstructionsToProcess) 2344 I->replaceUsesOfWith(D, CurrentMaterialization); 2345 2346 // Don't replace the final use at this point as this can cause problems 2347 // for other materializations. Instead, for any final use that uses a 2348 // define that's being rematerialized, record the replace values 2349 for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i) 2350 if (Use->getOperand(i) == D) // Is this operand pointing to oldval? 2351 FinalInstructionsToProcess.push_back( 2352 {Use, D, CurrentMaterialization}); 2353 2354 InstructionsToProcess.push_back(CurrentMaterialization); 2355 } 2356 } 2357 2358 // Finally, replace the uses with the defines that we've just rematerialized 2359 for (auto &R : FinalInstructionsToProcess) { 2360 if (auto *PN = dyn_cast<PHINode>(R.Use)) { 2361 assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming " 2362 "values in the PHINode"); 2363 PN->replaceAllUsesWith(R.Remat); 2364 PN->eraseFromParent(); 2365 continue; 2366 } 2367 R.Use->replaceUsesOfWith(R.Def, R.Remat); 2368 } 2369 } 2370 2371 // Splits the block at a particular instruction unless it is the first 2372 // instruction in the block with a single predecessor. 
2373 static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) { 2374 auto *BB = I->getParent(); 2375 if (&BB->front() == I) { 2376 if (BB->getSinglePredecessor()) { 2377 BB->setName(Name); 2378 return BB; 2379 } 2380 } 2381 return BB->splitBasicBlock(I, Name); 2382 } 2383 2384 // Split above and below a particular instruction so that it 2385 // will be all alone by itself in a block. 2386 static void splitAround(Instruction *I, const Twine &Name) { 2387 splitBlockIfNotFirst(I, Name); 2388 splitBlockIfNotFirst(I->getNextNode(), "After" + Name); 2389 } 2390 2391 static bool isSuspendBlock(BasicBlock *BB) { 2392 return isa<AnyCoroSuspendInst>(BB->front()); 2393 } 2394 2395 typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet; 2396 2397 /// Does control flow starting at the given block ever reach a suspend 2398 /// instruction before reaching a block in VisitedOrFreeBBs? 2399 static bool isSuspendReachableFrom(BasicBlock *From, 2400 VisitedBlocksSet &VisitedOrFreeBBs) { 2401 // Eagerly try to add this block to the visited set. If it's already 2402 // there, stop recursing; this path doesn't reach a suspend before 2403 // either looping or reaching a freeing block. 2404 if (!VisitedOrFreeBBs.insert(From).second) 2405 return false; 2406 2407 // We assume that we'll already have split suspends into their own blocks. 2408 if (isSuspendBlock(From)) 2409 return true; 2410 2411 // Recurse on the successors. 2412 for (auto *Succ : successors(From)) { 2413 if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs)) 2414 return true; 2415 } 2416 2417 return false; 2418 } 2419 2420 /// Is the given alloca "local", i.e. bounded in lifetime to not cross a 2421 /// suspend point? 2422 static bool isLocalAlloca(CoroAllocaAllocInst *AI) { 2423 // Seed the visited set with all the basic blocks containing a free 2424 // so that we won't pass them up. 2425 VisitedBlocksSet VisitedOrFreeBBs; 2426 for (auto *User : AI->users()) { 2427 if (auto FI = dyn_cast<CoroAllocaFreeInst>(User)) 2428 VisitedOrFreeBBs.insert(FI->getParent()); 2429 } 2430 2431 return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs); 2432 } 2433 2434 /// After we split the coroutine, will the given basic block be along 2435 /// an obvious exit path for the resumption function? 2436 static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, 2437 unsigned depth = 3) { 2438 // If we've bottomed out our depth count, stop searching and assume 2439 // that the path might loop back. 2440 if (depth == 0) return false; 2441 2442 // If this is a suspend block, we're about to exit the resumption function. 2443 if (isSuspendBlock(BB)) return true; 2444 2445 // Recurse into the successors. 2446 for (auto *Succ : successors(BB)) { 2447 if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1)) 2448 return false; 2449 } 2450 2451 // If none of the successors leads back in a loop, we're on an exit/abort. 2452 return true; 2453 } 2454 2455 static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) { 2456 // Look for a free that isn't sufficiently obviously followed by 2457 // either a suspend or a termination, i.e. something that will leave 2458 // the coro resumption frame. 2459 for (auto *U : AI->users()) { 2460 auto FI = dyn_cast<CoroAllocaFreeInst>(U); 2461 if (!FI) continue; 2462 2463 if (!willLeaveFunctionImmediatelyAfter(FI->getParent())) 2464 return true; 2465 } 2466 2467 // If we never found one, we don't need a stack save. 
2468 return false; 2469 } 2470 2471 /// Turn each of the given local allocas into a normal (dynamic) alloca 2472 /// instruction. 2473 static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas, 2474 SmallVectorImpl<Instruction*> &DeadInsts) { 2475 for (auto *AI : LocalAllocas) { 2476 IRBuilder<> Builder(AI); 2477 2478 // Save the stack depth. Try to avoid doing this if the stackrestore 2479 // is going to immediately precede a return or something. 2480 Value *StackSave = nullptr; 2481 if (localAllocaNeedsStackSave(AI)) 2482 StackSave = Builder.CreateStackSave(); 2483 2484 // Allocate memory. 2485 auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize()); 2486 Alloca->setAlignment(AI->getAlignment()); 2487 2488 for (auto *U : AI->users()) { 2489 // Replace gets with the allocation. 2490 if (isa<CoroAllocaGetInst>(U)) { 2491 U->replaceAllUsesWith(Alloca); 2492 2493 // Replace frees with stackrestores. This is safe because 2494 // alloca.alloc is required to obey a stack discipline, although we 2495 // don't enforce that structurally. 2496 } else { 2497 auto FI = cast<CoroAllocaFreeInst>(U); 2498 if (StackSave) { 2499 Builder.SetInsertPoint(FI); 2500 Builder.CreateStackRestore(StackSave); 2501 } 2502 } 2503 DeadInsts.push_back(cast<Instruction>(U)); 2504 } 2505 2506 DeadInsts.push_back(AI); 2507 } 2508 } 2509 2510 /// Turn the given coro.alloca.alloc call into a dynamic allocation. 2511 /// This happens during the all-instructions iteration, so it must not 2512 /// delete the call. 2513 static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI, 2514 coro::Shape &Shape, 2515 SmallVectorImpl<Instruction*> &DeadInsts) { 2516 IRBuilder<> Builder(AI); 2517 auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr); 2518 2519 for (User *U : AI->users()) { 2520 if (isa<CoroAllocaGetInst>(U)) { 2521 U->replaceAllUsesWith(Alloc); 2522 } else { 2523 auto FI = cast<CoroAllocaFreeInst>(U); 2524 Builder.SetInsertPoint(FI); 2525 Shape.emitDealloc(Builder, Alloc, nullptr); 2526 } 2527 DeadInsts.push_back(cast<Instruction>(U)); 2528 } 2529 2530 // Push this on last so that it gets deleted after all the others. 2531 DeadInsts.push_back(AI); 2532 2533 // Return the new allocation value so that we can check for needed spills. 2534 return cast<Instruction>(Alloc); 2535 } 2536 2537 /// Get the current swifterror value. 2538 static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy, 2539 coro::Shape &Shape) { 2540 // Make a fake function pointer as a sort of intrinsic. 2541 auto FnTy = FunctionType::get(ValueTy, {}, false); 2542 auto Fn = ConstantPointerNull::get(Builder.getPtrTy()); 2543 2544 auto Call = Builder.CreateCall(FnTy, Fn, {}); 2545 Shape.SwiftErrorOps.push_back(Call); 2546 2547 return Call; 2548 } 2549 2550 /// Set the given value as the current swifterror value. 2551 /// 2552 /// Returns a slot that can be used as a swifterror slot. 2553 static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V, 2554 coro::Shape &Shape) { 2555 // Make a fake function pointer as a sort of intrinsic. 2556 auto FnTy = FunctionType::get(Builder.getPtrTy(), 2557 {V->getType()}, false); 2558 auto Fn = ConstantPointerNull::get(Builder.getPtrTy()); 2559 2560 auto Call = Builder.CreateCall(FnTy, Fn, { V }); 2561 Shape.SwiftErrorOps.push_back(Call); 2562 2563 return Call; 2564 } 2565 2566 /// Set the swifterror value from the given alloca before a call, 2567 /// then put in back in the alloca afterwards. 
2568 /// 2569 /// Returns an address that will stand in for the swifterror slot 2570 /// until splitting. 2571 static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call, 2572 AllocaInst *Alloca, 2573 coro::Shape &Shape) { 2574 auto ValueTy = Alloca->getAllocatedType(); 2575 IRBuilder<> Builder(Call); 2576 2577 // Load the current value from the alloca and set it as the 2578 // swifterror value. 2579 auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca); 2580 auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape); 2581 2582 // Move to after the call. Since swifterror only has a guaranteed 2583 // value on normal exits, we can ignore implicit and explicit unwind 2584 // edges. 2585 if (isa<CallInst>(Call)) { 2586 Builder.SetInsertPoint(Call->getNextNode()); 2587 } else { 2588 auto Invoke = cast<InvokeInst>(Call); 2589 Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg()); 2590 } 2591 2592 // Get the current swifterror value and store it to the alloca. 2593 auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape); 2594 Builder.CreateStore(ValueAfterCall, Alloca); 2595 2596 return Addr; 2597 } 2598 2599 /// Eliminate a formerly-swifterror alloca by inserting the get/set 2600 /// intrinsics and attempting to MemToReg the alloca away. 2601 static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca, 2602 coro::Shape &Shape) { 2603 for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) { 2604 // swifterror values can only be used in very specific ways. 2605 // We take advantage of that here. 2606 auto User = Use.getUser(); 2607 if (isa<LoadInst>(User) || isa<StoreInst>(User)) 2608 continue; 2609 2610 assert(isa<CallInst>(User) || isa<InvokeInst>(User)); 2611 auto Call = cast<Instruction>(User); 2612 2613 auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape); 2614 2615 // Use the returned slot address as the call argument. 2616 Use.set(Addr); 2617 } 2618 2619 // All the uses should be loads and stores now. 2620 assert(isAllocaPromotable(Alloca)); 2621 } 2622 2623 /// "Eliminate" a swifterror argument by reducing it to the alloca case 2624 /// and then loading and storing in the prologue and epilog. 2625 /// 2626 /// The argument keeps the swifterror flag. 2627 static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, 2628 coro::Shape &Shape, 2629 SmallVectorImpl<AllocaInst*> &AllocasToPromote) { 2630 IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); 2631 2632 auto ArgTy = cast<PointerType>(Arg.getType()); 2633 auto ValueTy = PointerType::getUnqual(F.getContext()); 2634 2635 // Reduce to the alloca case: 2636 2637 // Create an alloca and replace all uses of the arg with it. 2638 auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace()); 2639 Arg.replaceAllUsesWith(Alloca); 2640 2641 // Set an initial value in the alloca. swifterror is always null on entry. 2642 auto InitialValue = Constant::getNullValue(ValueTy); 2643 Builder.CreateStore(InitialValue, Alloca); 2644 2645 // Find all the suspends in the function and save and restore around them. 2646 for (auto *Suspend : Shape.CoroSuspends) { 2647 (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape); 2648 } 2649 2650 // Find all the coro.ends in the function and restore the error value. 
  for (auto *End : Shape.CoroEnds) {
    Builder.SetInsertPoint(End);
    auto FinalValue = Builder.CreateLoad(ValueTy, Alloca);
    (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape);
  }

  // Now we can use the alloca logic.
  AllocasToPromote.push_back(Alloca);
  eliminateSwiftErrorAlloca(F, Alloca, Shape);
}

/// Eliminate all problematic uses of swifterror arguments and allocas
/// from the function. We'll fix them up later when splitting the function.
static void eliminateSwiftError(Function &F, coro::Shape &Shape) {
  SmallVector<AllocaInst*, 4> AllocasToPromote;

  // Look for a swifterror argument.
  for (auto &Arg : F.args()) {
    if (!Arg.hasSwiftErrorAttr()) continue;

    eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote);
    break;
  }

  // Look for swifterror allocas.
  for (auto &Inst : F.getEntryBlock()) {
    auto Alloca = dyn_cast<AllocaInst>(&Inst);
    if (!Alloca || !Alloca->isSwiftError()) continue;

    // Clear the swifterror flag.
    Alloca->setSwiftError(false);

    AllocasToPromote.push_back(Alloca);
    eliminateSwiftErrorAlloca(F, Alloca, Shape);
  }

  // If we have any allocas to promote, compute a dominator tree and
  // promote them en masse.
  if (!AllocasToPromote.empty()) {
    DominatorTree DT(F);
    PromoteMemToReg(AllocasToPromote, DT);
  }
}

/// retcon and retcon.once conventions assume that all spill uses can be sunk
/// after the coro.begin intrinsic.
static void sinkSpillUsesAfterCoroBegin(Function &F,
                                        const FrameDataInfo &FrameData,
                                        CoroBeginInst *CoroBegin) {
  DominatorTree Dom(F);

  SmallSetVector<Instruction *, 32> ToMove;
  SmallVector<Instruction *, 32> Worklist;

  // Collect all users that precede coro.begin.
  for (auto *Def : FrameData.getAllDefs()) {
    for (User *U : Def->users()) {
      auto Inst = cast<Instruction>(U);
      if (Inst->getParent() != CoroBegin->getParent() ||
          Dom.dominates(CoroBegin, Inst))
        continue;
      if (ToMove.insert(Inst))
        Worklist.push_back(Inst);
    }
  }
  // Recursively collect users before coro.begin.
  while (!Worklist.empty()) {
    auto *Def = Worklist.pop_back_val();
    for (User *U : Def->users()) {
      auto Inst = cast<Instruction>(U);
      if (Dom.dominates(CoroBegin, Inst))
        continue;
      if (ToMove.insert(Inst))
        Worklist.push_back(Inst);
    }
  }

  // Sort by dominance.
  SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end());
  llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool {
    // If A dominates B, A should precede (<) B.
    return Dom.dominates(A, B);
  });

  Instruction *InsertPt = CoroBegin->getNextNode();
  for (Instruction *Inst : InsertionList)
    Inst->moveBefore(InsertPt);
}

/// For each local variable whose uses are all confined to one suspended
/// region, sink its lifetime.start markers to the position right after the
/// suspend block. Doing so minimizes the lifetime of each variable, hence
/// minimizing the amount of data we end up putting on the frame.
static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
                                     SuspendCrossingInfo &Checker,
                                     const DominatorTree &DT) {
  if (F.hasOptNone())
    return;

  // Collect all possible basic blocks which may dominate all uses of allocas.
2751 SmallPtrSet<BasicBlock *, 4> DomSet; 2752 DomSet.insert(&F.getEntryBlock()); 2753 for (auto *CSI : Shape.CoroSuspends) { 2754 BasicBlock *SuspendBlock = CSI->getParent(); 2755 assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() && 2756 "should have split coro.suspend into its own block"); 2757 DomSet.insert(SuspendBlock->getSingleSuccessor()); 2758 } 2759 2760 for (Instruction &I : instructions(F)) { 2761 AllocaInst* AI = dyn_cast<AllocaInst>(&I); 2762 if (!AI) 2763 continue; 2764 2765 for (BasicBlock *DomBB : DomSet) { 2766 bool Valid = true; 2767 SmallVector<Instruction *, 1> Lifetimes; 2768 2769 auto isLifetimeStart = [](Instruction* I) { 2770 if (auto* II = dyn_cast<IntrinsicInst>(I)) 2771 return II->getIntrinsicID() == Intrinsic::lifetime_start; 2772 return false; 2773 }; 2774 2775 auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) { 2776 if (isLifetimeStart(U)) { 2777 Lifetimes.push_back(U); 2778 return true; 2779 } 2780 if (!U->hasOneUse() || U->stripPointerCasts() != AI) 2781 return false; 2782 if (isLifetimeStart(U->user_back())) { 2783 Lifetimes.push_back(U->user_back()); 2784 return true; 2785 } 2786 return false; 2787 }; 2788 2789 for (User *U : AI->users()) { 2790 Instruction *UI = cast<Instruction>(U); 2791 // For all users except lifetime.start markers, if they are all 2792 // dominated by one of the basic blocks and do not cross 2793 // suspend points as well, then there is no need to spill the 2794 // instruction. 2795 if (!DT.dominates(DomBB, UI->getParent()) || 2796 Checker.isDefinitionAcrossSuspend(DomBB, UI)) { 2797 // Skip lifetime.start, GEP and bitcast used by lifetime.start 2798 // markers. 2799 if (collectLifetimeStart(UI, AI)) 2800 continue; 2801 Valid = false; 2802 break; 2803 } 2804 } 2805 // Sink lifetime.start markers to dominate block when they are 2806 // only used outside the region. 2807 if (Valid && Lifetimes.size() != 0) { 2808 auto *NewLifetime = Lifetimes[0]->clone(); 2809 NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI); 2810 NewLifetime->insertBefore(DomBB->getTerminator()); 2811 2812 // All the outsided lifetime.start markers are no longer necessary. 2813 for (Instruction *S : Lifetimes) 2814 S->eraseFromParent(); 2815 2816 break; 2817 } 2818 } 2819 } 2820 } 2821 2822 static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape, 2823 const SuspendCrossingInfo &Checker, 2824 SmallVectorImpl<AllocaInfo> &Allocas, 2825 const DominatorTree &DT) { 2826 if (Shape.CoroSuspends.empty()) 2827 return; 2828 2829 // The PromiseAlloca will be specially handled since it needs to be in a 2830 // fixed position in the frame. 2831 if (AI == Shape.SwitchLowering.PromiseAlloca) 2832 return; 2833 2834 // The __coro_gro alloca should outlive the promise, make sure we 2835 // keep it outside the frame. 2836 if (AI->hasMetadata(LLVMContext::MD_coro_outside_frame)) 2837 return; 2838 2839 // The code that uses lifetime.start intrinsic does not work for functions 2840 // with loops without exit. Disable it on ABIs we know to generate such 2841 // code. 
2842 bool ShouldUseLifetimeStartInfo = 2843 (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && 2844 Shape.ABI != coro::ABI::RetconOnce); 2845 AllocaUseVisitor Visitor{AI->getDataLayout(), DT, Shape, Checker, 2846 ShouldUseLifetimeStartInfo}; 2847 Visitor.visitPtr(*AI); 2848 if (!Visitor.getShouldLiveOnFrame()) 2849 return; 2850 Allocas.emplace_back(AI, Visitor.getAliasesCopy(), 2851 Visitor.getMayWriteBeforeCoroBegin()); 2852 } 2853 2854 static std::optional<std::pair<Value &, DIExpression &>> 2855 salvageDebugInfoImpl(SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2856 bool OptimizeFrame, bool UseEntryValue, Function *F, 2857 Value *Storage, DIExpression *Expr, 2858 bool SkipOutermostLoad) { 2859 IRBuilder<> Builder(F->getContext()); 2860 auto InsertPt = F->getEntryBlock().getFirstInsertionPt(); 2861 while (isa<IntrinsicInst>(InsertPt)) 2862 ++InsertPt; 2863 Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt); 2864 2865 while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) { 2866 if (auto *LdInst = dyn_cast<LoadInst>(Inst)) { 2867 Storage = LdInst->getPointerOperand(); 2868 // FIXME: This is a heuristic that works around the fact that 2869 // LLVM IR debug intrinsics cannot yet distinguish between 2870 // memory and value locations: Because a dbg.declare(alloca) is 2871 // implicitly a memory location no DW_OP_deref operation for the 2872 // last direct load from an alloca is necessary. This condition 2873 // effectively drops the *last* DW_OP_deref in the expression. 2874 if (!SkipOutermostLoad) 2875 Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); 2876 } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) { 2877 Storage = StInst->getValueOperand(); 2878 } else { 2879 SmallVector<uint64_t, 16> Ops; 2880 SmallVector<Value *, 0> AdditionalValues; 2881 Value *Op = llvm::salvageDebugInfoImpl( 2882 *Inst, Expr ? Expr->getNumLocationOperands() : 0, Ops, 2883 AdditionalValues); 2884 if (!Op || !AdditionalValues.empty()) { 2885 // If salvaging failed or salvaging produced more than one location 2886 // operand, give up. 2887 break; 2888 } 2889 Storage = Op; 2890 Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false); 2891 } 2892 SkipOutermostLoad = false; 2893 } 2894 if (!Storage) 2895 return std::nullopt; 2896 2897 auto *StorageAsArg = dyn_cast<Argument>(Storage); 2898 const bool IsSwiftAsyncArg = 2899 StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync); 2900 2901 // Swift async arguments are described by an entry value of the ABI-defined 2902 // register containing the coroutine context. 2903 // Entry values in variadic expressions are not supported. 2904 if (IsSwiftAsyncArg && UseEntryValue && !Expr->isEntryValue() && 2905 Expr->isSingleLocationExpression()) 2906 Expr = DIExpression::prepend(Expr, DIExpression::EntryValue); 2907 2908 // If the coroutine frame is an Argument, store it in an alloca to improve 2909 // its availability (e.g. registers may be clobbered). 2910 // Avoid this if optimizations are enabled (they would remove the alloca) or 2911 // if the value is guaranteed to be available through other means (e.g. swift 2912 // ABI guarantees). 
2913 if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) { 2914 auto &Cached = ArgToAllocaMap[StorageAsArg]; 2915 if (!Cached) { 2916 Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr, 2917 Storage->getName() + ".debug"); 2918 Builder.CreateStore(Storage, Cached); 2919 } 2920 Storage = Cached; 2921 // FIXME: LLVM lacks nuanced semantics to differentiate between 2922 // memory and direct locations at the IR level. The backend will 2923 // turn a dbg.declare(alloca, ..., DIExpression()) into a memory 2924 // location. Thus, if there are deref and offset operations in the 2925 // expression, we need to add a DW_OP_deref at the *start* of the 2926 // expression to first load the contents of the alloca before 2927 // adjusting it with the expression. 2928 Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); 2929 } 2930 2931 return {{*Storage, *Expr}}; 2932 } 2933 2934 void coro::salvageDebugInfo( 2935 SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2936 DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool UseEntryValue) { 2937 2938 Function *F = DVI.getFunction(); 2939 // Follow the pointer arithmetic all the way to the incoming 2940 // function argument and convert into a DIExpression. 2941 bool SkipOutermostLoad = !isa<DbgValueInst>(DVI); 2942 Value *OriginalStorage = DVI.getVariableLocationOp(0); 2943 2944 auto SalvagedInfo = ::salvageDebugInfoImpl( 2945 ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, 2946 DVI.getExpression(), SkipOutermostLoad); 2947 if (!SalvagedInfo) 2948 return; 2949 2950 Value *Storage = &SalvagedInfo->first; 2951 DIExpression *Expr = &SalvagedInfo->second; 2952 2953 DVI.replaceVariableLocationOp(OriginalStorage, Storage); 2954 DVI.setExpression(Expr); 2955 // We only hoist dbg.declare today since it doesn't make sense to hoist 2956 // dbg.value since it does not have the same function wide guarantees that 2957 // dbg.declare does. 2958 if (isa<DbgDeclareInst>(DVI)) { 2959 std::optional<BasicBlock::iterator> InsertPt; 2960 if (auto *I = dyn_cast<Instruction>(Storage)) { 2961 InsertPt = I->getInsertionPointAfterDef(); 2962 // Update DILocation only if variable was not inlined. 2963 DebugLoc ILoc = I->getDebugLoc(); 2964 DebugLoc DVILoc = DVI.getDebugLoc(); 2965 if (ILoc && DVILoc && 2966 DVILoc->getScope()->getSubprogram() == 2967 ILoc->getScope()->getSubprogram()) 2968 DVI.setDebugLoc(I->getDebugLoc()); 2969 } else if (isa<Argument>(Storage)) 2970 InsertPt = F->getEntryBlock().begin(); 2971 if (InsertPt) 2972 DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt); 2973 } 2974 } 2975 2976 void coro::salvageDebugInfo( 2977 SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2978 DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue) { 2979 2980 Function *F = DVR.getFunction(); 2981 // Follow the pointer arithmetic all the way to the incoming 2982 // function argument and convert into a DIExpression. 
2983 bool SkipOutermostLoad = DVR.isDbgDeclare(); 2984 Value *OriginalStorage = DVR.getVariableLocationOp(0); 2985 2986 auto SalvagedInfo = ::salvageDebugInfoImpl( 2987 ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, 2988 DVR.getExpression(), SkipOutermostLoad); 2989 if (!SalvagedInfo) 2990 return; 2991 2992 Value *Storage = &SalvagedInfo->first; 2993 DIExpression *Expr = &SalvagedInfo->second; 2994 2995 DVR.replaceVariableLocationOp(OriginalStorage, Storage); 2996 DVR.setExpression(Expr); 2997 // We only hoist dbg.declare today since it doesn't make sense to hoist 2998 // dbg.value since it does not have the same function wide guarantees that 2999 // dbg.declare does. 3000 if (DVR.getType() == DbgVariableRecord::LocationType::Declare) { 3001 std::optional<BasicBlock::iterator> InsertPt; 3002 if (auto *I = dyn_cast<Instruction>(Storage)) { 3003 InsertPt = I->getInsertionPointAfterDef(); 3004 // Update DILocation only if variable was not inlined. 3005 DebugLoc ILoc = I->getDebugLoc(); 3006 DebugLoc DVRLoc = DVR.getDebugLoc(); 3007 if (ILoc && DVRLoc && 3008 DVRLoc->getScope()->getSubprogram() == 3009 ILoc->getScope()->getSubprogram()) 3010 DVR.setDebugLoc(ILoc); 3011 } else if (isa<Argument>(Storage)) 3012 InsertPt = F->getEntryBlock().begin(); 3013 if (InsertPt) { 3014 DVR.removeFromParent(); 3015 (*InsertPt)->getParent()->insertDbgRecordBefore(&DVR, *InsertPt); 3016 } 3017 } 3018 } 3019 3020 static void doRematerializations( 3021 Function &F, SuspendCrossingInfo &Checker, 3022 const std::function<bool(Instruction &)> &MaterializableCallback) { 3023 if (F.hasOptNone()) 3024 return; 3025 3026 SpillInfo Spills; 3027 3028 // See if there are materializable instructions across suspend points 3029 // We record these as the starting point to also identify materializable 3030 // defs of uses in these operations 3031 for (Instruction &I : instructions(F)) { 3032 if (!MaterializableCallback(I)) 3033 continue; 3034 for (User *U : I.users()) 3035 if (Checker.isDefinitionAcrossSuspend(I, U)) 3036 Spills[&I].push_back(cast<Instruction>(U)); 3037 } 3038 3039 // Process each of the identified rematerializable instructions 3040 // and add predecessor instructions that can also be rematerialized. 3041 // This is actually a graph of instructions since we could potentially 3042 // have multiple uses of a def in the set of predecessor instructions. 3043 // The approach here is to maintain a graph of instructions for each bottom 3044 // level instruction - where we have a unique set of instructions (nodes) 3045 // and edges between them. We then walk the graph in reverse post-dominator 3046 // order to insert them past the suspend point, but ensure that ordering is 3047 // correct. We also rely on CSE removing duplicate defs for remats of 3048 // different instructions with a def in common (rather than maintaining more 3049 // complex graphs for each suspend point) 3050 3051 // We can do this by adding new nodes to the list for each suspend 3052 // point. 
  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
  for (auto &E : Spills) {
    for (Instruction *U : E.second) {
      // Don't process a user twice (this can happen if the instruction uses
      // more than one rematerializable def).
      if (AllRemats.count(U))
        continue;

      // Constructor creates the whole RematGraph for the given Use.
      auto RematUPtr =
          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);

      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
                 for (auto I = RPOT.begin(); I != RPOT.end(); ++I) {
                   (*I)->Node->dump();
                 } dbgs() << "\n";);

      AllRemats[U] = std::move(RematUPtr);
    }
  }

  // Rewrite materializable instructions to be materialized at the use
  // point.
  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
  rewriteMaterializableInstructions(AllRemats);
}

void coro::buildCoroutineFrame(
    Function &F, Shape &Shape, TargetTransformInfo &TTI,
    const std::function<bool(Instruction &)> &MaterializableCallback) {
  // Don't eliminate swifterror in async functions that won't be split.
  if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
    eliminateSwiftError(F, Shape);

  if (Shape.ABI == coro::ABI::Switch &&
      Shape.SwitchLowering.PromiseAlloca) {
    Shape.getSwitchCoroId()->clearPromise();
  }

  // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
  // intrinsics are in their own blocks to simplify the logic of building up
  // SuspendCrossing data.
  for (auto *CSI : Shape.CoroSuspends) {
    if (auto *Save = CSI->getCoroSave())
      splitAround(Save, "CoroSave");
    splitAround(CSI, "CoroSuspend");
  }

  // Put CoroEnds into their own blocks.
  for (AnyCoroEndInst *CE : Shape.CoroEnds) {
    splitAround(CE, "CoroEnd");

    // Emit the musttail call function in a new block before the CoroEnd.
    // We do this here so that the right suspend crossing info is computed for
    // the uses of the musttail call. (Arguments to the coro.end instructions
    // would be ignored.)
    if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) {
      auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction();
      if (!MustTailCallFn)
        continue;
      IRBuilder<> Builder(AsyncEnd);
      SmallVector<Value *, 8> Args(AsyncEnd->args());
      auto Arguments = ArrayRef<Value *>(Args).drop_front(3);
      auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
                                      TTI, Arguments, Builder);
      splitAround(Call, "MustTailCall.Before.CoroEnd");
    }
  }

  // Later code makes structural assumptions about single-predecessor PHIs,
  // e.g. that they are not live across a suspend point.
  cleanupSinglePredPHIs(F);

  // Transforms multi-edge PHI nodes so that any value feeding into a PHI
  // will never have its definition separated from the PHI by a suspend point.
  rewritePHIs(F);
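  // A sketch of the effect of the PHI rewrite above (block and value names
  // are illustrative, not taken from a real test): a multi-edge PHI such as
  //     resume:
  //       %v = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
  // is turned into single-incoming PHIs on split edges, roughly
  //     bb1.split:
  //       %v1 = phi i32 [ %a, %bb1 ]
  //       br label %resume
  //     resume:
  //       %v = phi i32 [ %v1, %bb1.split ], [ %v2, %bb2.split ]
  // so that each incoming value is defined in the PHI's own predecessor and
  // can be spilled and reloaded independently if it crosses a suspend point.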
  // Build suspend crossing info.
  SuspendCrossingInfo Checker(F, Shape);

  doRematerializations(F, Checker, MaterializableCallback);

  const DominatorTree DT(F);
  FrameDataInfo FrameData;
  SmallVector<CoroAllocaAllocInst *, 4> LocalAllocas;
  SmallVector<Instruction *, 4> DeadInstructions;
  if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
      Shape.ABI != coro::ABI::RetconOnce)
    sinkLifetimeStartMarkers(F, Shape, Checker, DT);

  // Collect the spills for arguments and other not-materializable values.
  for (Argument &A : F.args())
    for (User *U : A.users())
      if (Checker.isDefinitionAcrossSuspend(A, U))
        FrameData.Spills[&A].push_back(cast<Instruction>(U));

  for (Instruction &I : instructions(F)) {
    // Values returned from coroutine structure intrinsics should not be part
    // of the Coroutine Frame.
    if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin)
      continue;

    // Handle alloca.alloc specially here.
    if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) {
      // Check whether the alloca's lifetime is bounded by suspend points.
      if (isLocalAlloca(AI)) {
        LocalAllocas.push_back(AI);
        continue;
      }

      // If not, do a quick rewrite of the alloca and then add spills of
      // the rewritten value. The rewrite doesn't invalidate anything in
      // Spills because the other alloca intrinsics have no other operands
      // besides AI, and it doesn't invalidate the iteration because we delay
      // erasing AI.
      auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions);

      for (User *U : Alloc->users()) {
        if (Checker.isDefinitionAcrossSuspend(*Alloc, U))
          FrameData.Spills[Alloc].push_back(cast<Instruction>(U));
      }
      continue;
    }

    // Ignore alloca.get; we process this as part of coro.alloca.alloc.
    if (isa<CoroAllocaGetInst>(I))
      continue;

    if (auto *AI = dyn_cast<AllocaInst>(&I)) {
      collectFrameAlloca(AI, Shape, Checker, FrameData.Allocas, DT);
      continue;
    }

    for (User *U : I.users())
      if (Checker.isDefinitionAcrossSuspend(I, U)) {
        // We cannot spill a token.
        if (I.getType()->isTokenTy())
          report_fatal_error(
              "token definition is separated from the use by a suspend point");
        FrameData.Spills[&I].push_back(cast<Instruction>(U));
      }
  }

  LLVM_DEBUG(dumpAllocas(FrameData.Allocas));

  // We don't want the layout of the coroutine frame to be affected by debug
  // information, so we only salvage debug values (DbgValueInsts and
  // DbgVariableRecords) whose value is already in the frame. The dbg.values
  // for allocas are handled specially.
  for (auto &Iter : FrameData.Spills) {
    auto *V = Iter.first;
    SmallVector<DbgValueInst *, 16> DVIs;
    SmallVector<DbgVariableRecord *, 16> DVRs;
    findDbgValues(DVIs, V, &DVRs);
    for (DbgValueInst *DVI : DVIs)
      if (Checker.isDefinitionAcrossSuspend(*V, DVI))
        FrameData.Spills[V].push_back(DVI);
    // Add the instructions which carry debug info that is in the frame.
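    // DbgVariableRecords are not instructions, so the suspend-crossing check
    // below is performed on the instruction the record is attached to (its
    // marker's MarkedInstr), which also stands in for the record in Spills.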
    for (DbgVariableRecord *DVR : DVRs)
      if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
        FrameData.Spills[V].push_back(DVR->Marker->MarkedInstr);
  }

  LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
      Shape.ABI == coro::ABI::Async)
    sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin);
  Shape.FrameTy = buildFrameType(F, Shape, FrameData);
  Shape.FramePtr = Shape.CoroBegin;
  // For now, this works for C++ programs only.
  buildFrameDebugInfo(F, Shape, FrameData);
  insertSpills(FrameData, Shape);
  lowerLocalAllocas(LocalAllocas, DeadInstructions);

  for (auto *I : DeadInstructions)
    I->eraseFromParent();
}