//===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file contains classes used to discover, for a particular value, whether
// there is a path from its definition to a use that crosses a suspend point.
//
// Using the information discovered we form a Coroutine Frame structure to
// contain those values. All uses of those values are replaced with appropriate
// GEP + load from the coroutine frame. At the point of the definition we spill
// the value into the coroutine frame.
//===----------------------------------------------------------------------===//

#include "CoroInternal.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <deque>
#include <optional>

using namespace llvm;

extern cl::opt<bool> UseNewDbgInfoFormat;

// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
// "coro-frame", which results in leaner debug spew.
#define DEBUG_TYPE "coro-suspend-crossing"

enum { SmallVectorThreshold = 32 };

// Provides a two-way mapping between blocks and their indices.
namespace {
class BlockToIndexMapping {
  SmallVector<BasicBlock *, SmallVectorThreshold> V;

public:
  size_t size() const { return V.size(); }

  BlockToIndexMapping(Function &F) {
    for (BasicBlock &BB : F)
      V.push_back(&BB);
    llvm::sort(V);
  }

  size_t blockToIndex(BasicBlock const *BB) const {
    auto *I = llvm::lower_bound(V, BB);
    assert(I != V.end() && *I == BB && "BlockToIndexMapping: Unknown block");
    return I - V.begin();
  }

  BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
};
} // end anonymous namespace

// The SuspendCrossingInfo maintains data that allows us to answer the question
// of whether, given two BasicBlocks A and B, there is a path from A to B that
// passes through a suspend point.
//
// For every basic block 'i' it maintains a BlockData that consists of:
//   Consumes: a bit vector which contains a set of indices of blocks that can
//             reach block 'i'. A block can trivially reach itself.
//   Kills: a bit vector which contains a set of indices of blocks that can
//          reach block 'i' via a path that crosses a suspend point and does
//          not repeat 'i' (i.e. a path to 'i' without cycles containing 'i').
//   Suspend: a boolean indicating whether block 'i' contains a suspend point.
//   End: a boolean indicating whether block 'i' contains a coro.end intrinsic.
//   KillLoop: There is a path from 'i' to 'i' not otherwise repeating 'i' that
//             crosses a suspend point.
//
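// As an illustrative sketch (block names are made up, not taken from any
// test), consider a straight-line CFG  entry -> susp -> use,  where 'susp'
// contains a coro.suspend. After the dataflow converges:
//
//   entry: Consumes = {entry},            Kills = {}
//   susp:  Consumes = {entry, susp},      Kills = {entry, susp}
//   use:   Consumes = {entry, susp, use}, Kills = {entry, susp}
//
// hasPathCrossingSuspendPoint(entry, use) reads Kills[entry] of block 'use'
// and returns true, so a value defined in 'entry' and used in 'use' must be
// spilled to the coroutine frame.
//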
namespace {
class SuspendCrossingInfo {
  BlockToIndexMapping Mapping;

  struct BlockData {
    BitVector Consumes;
    BitVector Kills;
    bool Suspend = false;
    bool End = false;
    bool KillLoop = false;
    bool Changed = false;
  };
  SmallVector<BlockData, SmallVectorThreshold> Block;

  iterator_range<pred_iterator> predecessors(BlockData const &BD) const {
    BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
    return llvm::predecessors(BB);
  }

  BlockData &getBlockData(BasicBlock *BB) {
    return Block[Mapping.blockToIndex(BB)];
  }

  /// Compute the BlockData for the current function in one iteration.
  /// Initialize - Whether this is the first iteration; the initial pass is
  /// optimized a little bit by switching on this template parameter.
  /// Returns whether the BlockData changed in this iteration.
  template <bool Initialize = false>
  bool computeBlockData(const ReversePostOrderTraversal<Function *> &RPOT);

public:
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() const;
  void dump(StringRef Label, BitVector const &BV) const;
#endif

  SuspendCrossingInfo(Function &F, coro::Shape &Shape);

  /// Returns true if there is a path from \p From to \p To crossing a suspend
  /// point without crossing \p From a 2nd time.
  bool hasPathCrossingSuspendPoint(BasicBlock *From, BasicBlock *To) const {
    size_t const FromIndex = Mapping.blockToIndex(From);
    size_t const ToIndex = Mapping.blockToIndex(To);
    bool const Result = Block[ToIndex].Kills[FromIndex];
    LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
                      << " answer is " << Result << "\n");
    return Result;
  }

  /// Returns true if there is a path from \p From to \p To crossing a suspend
  /// point without crossing \p From a 2nd time. If \p From is the same as
  /// \p To this will also check if there is a looping path crossing a suspend
  /// point.
  bool hasPathOrLoopCrossingSuspendPoint(BasicBlock *From,
                                         BasicBlock *To) const {
    size_t const FromIndex = Mapping.blockToIndex(From);
    size_t const ToIndex = Mapping.blockToIndex(To);
    bool Result = Block[ToIndex].Kills[FromIndex] ||
                  (From == To && Block[ToIndex].KillLoop);
    LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
                      << " answer is " << Result << " (path or loop)\n");
    return Result;
  }

  bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const {
    auto *I = cast<Instruction>(U);

    // We rewrote PHINodes, so that only the ones with exactly one incoming
    // value need to be analyzed.
    if (auto *PN = dyn_cast<PHINode>(I))
      if (PN->getNumIncomingValues() > 1)
        return false;

    BasicBlock *UseBB = I->getParent();

    // As a special case, treat uses by an llvm.coro.suspend.retcon or an
    // llvm.coro.suspend.async as if they were uses in the suspend's single
    // predecessor: the uses conceptually occur before the suspend.
    if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) {
      UseBB = UseBB->getSinglePredecessor();
      assert(UseBB && "should have split coro.suspend into its own block");
    }

    return hasPathCrossingSuspendPoint(DefBB, UseBB);
  }

  bool isDefinitionAcrossSuspend(Argument &A, User *U) const {
    return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U);
  }

  bool isDefinitionAcrossSuspend(Instruction &I, User *U) const {
    auto *DefBB = I.getParent();

    // As a special case, treat values produced by an llvm.coro.suspend.*
    // as if they were defined in the single successor: the uses
    // conceptually occur after the suspend.
    if (isa<AnyCoroSuspendInst>(I)) {
      DefBB = DefBB->getSingleSuccessor();
      assert(DefBB && "should have split coro.suspend into its own block");
    }

    return isDefinitionAcrossSuspend(DefBB, U);
  }

  bool isDefinitionAcrossSuspend(Value &V, User *U) const {
    if (auto *Arg = dyn_cast<Argument>(&V))
      return isDefinitionAcrossSuspend(*Arg, U);
    if (auto *Inst = dyn_cast<Instruction>(&V))
      return isDefinitionAcrossSuspend(*Inst, U);

    llvm_unreachable(
        "Coroutine could only collect Argument and Instruction now.");
  }
};
} // end anonymous namespace

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
                                                BitVector const &BV) const {
  dbgs() << Label << ":";
  for (size_t I = 0, N = BV.size(); I < N; ++I)
    if (BV[I])
      dbgs() << " " << Mapping.indexToBlock(I)->getName();
  dbgs() << "\n";
}

LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
  for (size_t I = 0, N = Block.size(); I < N; ++I) {
    BasicBlock *const B = Mapping.indexToBlock(I);
    dbgs() << B->getName() << ":\n";
    dump(" Consumes", Block[I].Consumes);
    dump(" Kills", Block[I].Kills);
  }
  dbgs() << "\n";
}
#endif

template <bool Initialize>
bool SuspendCrossingInfo::computeBlockData(
    const ReversePostOrderTraversal<Function *> &RPOT) {
  bool Changed = false;

  for (const BasicBlock *BB : RPOT) {
    auto BBNo = Mapping.blockToIndex(BB);
    auto &B = Block[BBNo];

    // We don't need to check the predecessors on the initialization pass.
    if constexpr (!Initialize)
      // If none of the predecessors of the current block changed, the
      // BlockData for the current block cannot change either.
      if (all_of(predecessors(B), [this](BasicBlock *BB) {
            return !Block[Mapping.blockToIndex(BB)].Changed;
          })) {
        B.Changed = false;
        continue;
      }

    // Save the Consumes and Kills bitsets so that it is easy to see
    // whether anything changed after propagation.
    auto SavedConsumes = B.Consumes;
    auto SavedKills = B.Kills;

    for (BasicBlock *PI : predecessors(B)) {
      auto PrevNo = Mapping.blockToIndex(PI);
      auto &P = Block[PrevNo];

      // Propagate Kills and Consumes from predecessors into B.
      B.Consumes |= P.Consumes;
      B.Kills |= P.Kills;

      // If block P is a suspend block, it should propagate kills into block
      // B for every block P consumes.
      if (P.Suspend)
        B.Kills |= P.Consumes;
    }

    if (B.Suspend) {
      // If block B is a suspend block, it should kill all of the blocks it
      // consumes.
      B.Kills |= B.Consumes;
    } else if (B.End) {
      // If block B is an end block, it should not propagate kills as the
      // blocks following coro.end() are reached during initial invocation
      // of the coroutine while all the data are still available on the
      // stack or in the registers.
      B.Kills.reset();
    } else {
      // This is reached when block B is neither a suspend nor a coro.end
      // block, and we need to make sure that it is not in its own kill set.
      B.KillLoop |= B.Kills[BBNo];
      B.Kills.reset(BBNo);
    }

    if constexpr (!Initialize) {
      B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
      Changed |= B.Changed;
    }
  }

  return Changed;
}

SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
    : Mapping(F) {
  const size_t N = Mapping.size();
  Block.resize(N);

  // Initialize every block so that it consumes itself.
  for (size_t I = 0; I < N; ++I) {
    auto &B = Block[I];
    B.Consumes.resize(N);
    B.Kills.resize(N);
    B.Consumes.set(I);
    B.Changed = true;
  }

  // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as
  // the code beyond coro.end is reachable during initial invocation of the
  // coroutine.
  for (auto *CE : Shape.CoroEnds)
    getBlockData(CE->getParent()).End = true;

  // Mark all suspend blocks and indicate that they kill everything they
  // consume. Note that crossing coro.save also requires a spill, as any code
  // between coro.save and coro.suspend may resume the coroutine and all of the
  // state needs to be saved by that time.
  auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
    BasicBlock *SuspendBlock = BarrierInst->getParent();
    auto &B = getBlockData(SuspendBlock);
    B.Suspend = true;
    B.Kills |= B.Consumes;
  };
  for (auto *CSI : Shape.CoroSuspends) {
    markSuspendBlock(CSI);
    if (auto *Save = CSI->getCoroSave())
      markSuspendBlock(Save);
  }

  // It is considered to be faster to use RPO traversal for forward-edge
  // dataflow analysis.
  ReversePostOrderTraversal<Function *> RPOT(&F);
  computeBlockData</*Initialize=*/true>(RPOT);
  while (computeBlockData</*Initialize*/ false>(RPOT))
    ;

  LLVM_DEBUG(dump());
}

namespace {

// RematGraph is used to construct a DAG for rematerializable instructions.
// When the constructor is invoked with a candidate instruction (which is
// materializable) it builds a DAG of materializable instructions from that
// point.
// Typically, for each instruction identified as re-materializable across a
// suspend point, a RematGraph will be created.
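//
// As an illustrative sketch (names are made up, not from any test case),
// suppose the address computation below is deemed materializable and its
// result is used across a suspend point:
//
//   %idx2 = shl i64 %n, 2
//   %gep  = getelementptr inbounds i8, ptr %base, i64 %idx2
//   ; ... llvm.coro.suspend ...
//   %v    = load i32, ptr %gep
//
// The RematGraph built for %gep has %gep as its EntryNode and an operand edge
// to %idx2 (also materializable and crossing the suspend), so both can be
// re-emitted just before %v instead of spilling %gep to the frame.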
struct RematGraph {
  // Each RematNode in the graph contains the edges to instructions providing
  // operands in the current node.
  struct RematNode {
    Instruction *Node;
    SmallVector<RematNode *> Operands;
    RematNode() = default;
    RematNode(Instruction *V) : Node(V) {}
  };

  RematNode *EntryNode;
  using RematNodeMap =
      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
  RematNodeMap Remats;
  const std::function<bool(Instruction &)> &MaterializableCallback;
  SuspendCrossingInfo &Checker;

  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
             Instruction *I, SuspendCrossingInfo &Checker)
      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
    EntryNode = FirstNode.get();
    std::deque<std::unique_ptr<RematNode>> WorkList;
    addNode(std::move(FirstNode), WorkList, cast<User>(I));
    while (WorkList.size()) {
      std::unique_ptr<RematNode> N = std::move(WorkList.front());
      WorkList.pop_front();
      addNode(std::move(N), WorkList, cast<User>(I));
    }
  }

  void addNode(std::unique_ptr<RematNode> NUPtr,
               std::deque<std::unique_ptr<RematNode>> &WorkList,
               User *FirstUse) {
    RematNode *N = NUPtr.get();
    if (Remats.count(N->Node))
      return;

    // We haven't seen this node yet - add it to the list.
    Remats[N->Node] = std::move(NUPtr);
    for (auto &Def : N->Node->operands()) {
      Instruction *D = dyn_cast<Instruction>(Def.get());
      if (!D || !MaterializableCallback(*D) ||
          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
        continue;

      if (Remats.count(D)) {
        // Already have this in the graph.
        N->Operands.push_back(Remats[D].get());
        continue;
      }

      bool NoMatch = true;
      for (auto &I : WorkList) {
        if (I->Node == D) {
          NoMatch = false;
          N->Operands.push_back(I.get());
          break;
        }
      }
      if (NoMatch) {
        // Create a new node.
        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
        N->Operands.push_back(ChildNode.get());
        WorkList.push_back(std::move(ChildNode));
      }
    }
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() const {
    dbgs() << "Entry (";
    if (EntryNode->Node->getParent()->hasName())
      dbgs() << EntryNode->Node->getParent()->getName();
    else
      EntryNode->Node->getParent()->printAsOperand(dbgs(), false);
    dbgs() << ") : " << *EntryNode->Node << "\n";
    for (auto &E : Remats) {
      dbgs() << *(E.first) << "\n";
      for (RematNode *U : E.second->Operands)
        dbgs() << " " << *U->Node << "\n";
    }
  }
#endif
};
} // end anonymous namespace

namespace llvm {

template <> struct GraphTraits<RematGraph *> {
  using NodeRef = RematGraph::RematNode *;
  using ChildIteratorType = RematGraph::RematNode **;

  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
  static ChildIteratorType child_begin(NodeRef N) {
    return N->Operands.begin();
  }
  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};

} // end namespace llvm

#undef DEBUG_TYPE // "coro-suspend-crossing"
#define DEBUG_TYPE "coro-frame"

namespace {
class FrameTypeBuilder;
// Mapping from the to-be-spilled value to all the users that need a reload.
using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
struct AllocaInfo {
  AllocaInst *Alloca;
  DenseMap<Instruction *, std::optional<APInt>> Aliases;
  bool MayWriteBeforeCoroBegin;
  AllocaInfo(AllocaInst *Alloca,
             DenseMap<Instruction *, std::optional<APInt>> Aliases,
             bool MayWriteBeforeCoroBegin)
      : Alloca(Alloca), Aliases(std::move(Aliases)),
        MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
};
struct FrameDataInfo {
  // All the values (that are not allocas) that need to be spilled to the
  // frame.
  SpillInfo Spills;
  // Allocas contains all values defined as allocas that need to live in the
  // frame.
  SmallVector<AllocaInfo, 8> Allocas;

  SmallVector<Value *, 8> getAllDefs() const {
    SmallVector<Value *, 8> Defs;
    for (const auto &P : Spills)
      Defs.push_back(P.first);
    for (const auto &A : Allocas)
      Defs.push_back(A.Alloca);
    return Defs;
  }

  uint32_t getFieldIndex(Value *V) const {
    auto Itr = FieldIndexMap.find(V);
    assert(Itr != FieldIndexMap.end() &&
           "Value does not have a frame field index");
    return Itr->second;
  }

  void setFieldIndex(Value *V, uint32_t Index) {
    assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) &&
           "Cannot set the index for the same field twice.");
    FieldIndexMap[V] = Index;
  }

  Align getAlign(Value *V) const {
    auto Iter = FieldAlignMap.find(V);
    assert(Iter != FieldAlignMap.end());
    return Iter->second;
  }

  void setAlign(Value *V, Align AL) {
    assert(FieldAlignMap.count(V) == 0);
    FieldAlignMap.insert({V, AL});
  }

  uint64_t getDynamicAlign(Value *V) const {
    auto Iter = FieldDynamicAlignMap.find(V);
    assert(Iter != FieldDynamicAlignMap.end());
    return Iter->second;
  }

  void setDynamicAlign(Value *V, uint64_t Align) {
    assert(FieldDynamicAlignMap.count(V) == 0);
    FieldDynamicAlignMap.insert({V, Align});
  }

  uint64_t getOffset(Value *V) const {
    auto Iter = FieldOffsetMap.find(V);
    assert(Iter != FieldOffsetMap.end());
    return Iter->second;
  }

  void setOffset(Value *V, uint64_t Offset) {
    assert(FieldOffsetMap.count(V) == 0);
    FieldOffsetMap.insert({V, Offset});
  }

  // Remap the index of every field in the frame, using the final layout index.
  void updateLayoutIndex(FrameTypeBuilder &B);

private:
  // LayoutIndexUpdateStarted is used to avoid updating the index of any field
  // twice by mistake.
  bool LayoutIndexUpdateStarted = false;
  // Map from values to their slot indexes on the frame. They will first be set
  // to their original insertion field index. After the frame is built, their
  // indexes will be updated to the final layout index.
  DenseMap<Value *, uint32_t> FieldIndexMap;
  // Map from values to their alignment on the frame. They are set after
  // the frame is built.
  DenseMap<Value *, Align> FieldAlignMap;
  DenseMap<Value *, uint64_t> FieldDynamicAlignMap;
  // Map from values to their offset on the frame. They are set after
  // the frame is built.
  DenseMap<Value *, uint64_t> FieldOffsetMap;
};
} // namespace

#ifndef NDEBUG
static void dumpSpills(StringRef Title, const SpillInfo &Spills) {
  dbgs() << "------------- " << Title << " --------------\n";
  for (const auto &E : Spills) {
    E.first->dump();
    dbgs() << " user: ";
    for (auto *I : E.second)
      I->dump();
  }
}
static void dumpRemats(
    StringRef Title,
    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
  dbgs() << "------------- " << Title << " --------------\n";
  for (const auto &E : RM) {
    E.second->dump();
    dbgs() << "--\n";
  }
}

static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
  dbgs() << "------------- Allocas --------------\n";
  for (const auto &A : Allocas) {
    A.Alloca->dump();
  }
}
#endif

namespace {
using FieldIDType = size_t;
// We cannot rely solely on natural alignment of a type when building a
// coroutine frame; if the alignment specified on the Alloca instruction
// differs from the natural alignment of the alloca type, we will need to
// insert padding.
class FrameTypeBuilder {
private:
  struct Field {
    uint64_t Size;
    uint64_t Offset;
    Type *Ty;
    FieldIDType LayoutFieldIndex;
    Align Alignment;
    Align TyAlignment;
    uint64_t DynamicAlignBuffer;
  };

  const DataLayout &DL;
  LLVMContext &Context;
  uint64_t StructSize = 0;
  Align StructAlign;
  bool IsFinished = false;

  std::optional<Align> MaxFrameAlignment;

  SmallVector<Field, 8> Fields;
  DenseMap<Value*, unsigned> FieldIndexByKey;

public:
  FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL,
                   std::optional<Align> MaxFrameAlignment)
      : DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {}

  /// Add a field to this structure for the storage of an `alloca`
  /// instruction.
  [[nodiscard]] FieldIDType addFieldForAlloca(AllocaInst *AI,
                                              bool IsHeader = false) {
    Type *Ty = AI->getAllocatedType();

    // Make an array type if this is a static array allocation.
    if (AI->isArrayAllocation()) {
      if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
        Ty = ArrayType::get(Ty, CI->getValue().getZExtValue());
      else
        report_fatal_error("Coroutines cannot handle non static allocas yet");
    }

    return addField(Ty, AI->getAlign(), IsHeader);
  }

  /// We want to put allocas whose lifetime ranges do not overlap
  /// into the same slot of the coroutine frame.
  /// Consider the example at: https://bugs.llvm.org/show_bug.cgi?id=45566
  ///
  ///     cppcoro::task<void> alternative_paths(bool cond) {
  ///       if (cond) {
  ///         big_structure a;
  ///         process(a);
  ///         co_await something();
  ///       } else {
  ///         big_structure b;
  ///         process2(b);
  ///         co_await something();
  ///       }
  ///     }
  ///
  /// We want to put variable a and variable b in the same slot to
  /// reduce the size of the coroutine frame.
  ///
  /// This function uses the StackLifetime algorithm to partition the allocas
  /// in FrameData into non-overlapping sets, so that allocas in the same set
  /// can be assigned to the same slot in the coroutine frame. It then adds one
  /// field per set, using the largest alloca's type as the field type.
  ///
  /// Side Effects: Because we sort the allocas, the order of the allocas in
  /// the frame may differ from their order in the source code.
  void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData,
                          coro::Shape &Shape);

  /// Add a field to this structure.
  [[nodiscard]] FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment,
                                     bool IsHeader = false,
                                     bool IsSpillOfValue = false) {
    assert(!IsFinished && "adding fields to a finished builder");
    assert(Ty && "must provide a type for a field");

    // The field size is always the alloc size of the type.
    uint64_t FieldSize = DL.getTypeAllocSize(Ty);

    // For an alloca with size=0, we don't need to add a field; it can just
    // point to any index in the frame. Use index 0.
    if (FieldSize == 0) {
      return 0;
    }

    // The field alignment might not be the type alignment, but we need
    // to remember the type alignment anyway to build the type.
    // If we are spilling values we don't need to worry about ABI alignment
    // concerns.
    Align ABIAlign = DL.getABITypeAlign(Ty);
    Align TyAlignment = ABIAlign;
    if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign)
      TyAlignment = *MaxFrameAlignment;
    Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment);

    // The field alignment could be bigger than the max frame alignment; in
    // that case we request additional storage to be able to dynamically align
    // the pointer.
    uint64_t DynamicAlignBuffer = 0;
    if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) {
      DynamicAlignBuffer =
          offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment);
      FieldAlignment = *MaxFrameAlignment;
      FieldSize = FieldSize + DynamicAlignBuffer;
    }

    // Lay out header fields immediately.
    uint64_t Offset;
    if (IsHeader) {
      Offset = alignTo(StructSize, FieldAlignment);
      StructSize = Offset + FieldSize;

      // Everything else has a flexible offset.
    } else {
      Offset = OptimizedStructLayoutField::FlexibleOffset;
    }

    Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment,
                      DynamicAlignBuffer});
    return Fields.size() - 1;
  }
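
  // As a hedged numeric sketch (the values are made up, not tied to any
  // target): with a max frame alignment of 16 and a spilled value requiring
  // alignment 32, the code above requests offsetToAlignment(16, Align(32)),
  // i.e. 16 extra bytes, clamps the field alignment to 16, and grows the
  // field size by the buffer, so the pointer can be re-aligned to 32
  // dynamically at runtime.
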
  /// Finish the layout and set the body on the given type.
  void finish(StructType *Ty);

  uint64_t getStructSize() const {
    assert(IsFinished && "not yet finished!");
    return StructSize;
  }

  Align getStructAlign() const {
    assert(IsFinished && "not yet finished!");
    return StructAlign;
  }

  FieldIDType getLayoutFieldIndex(FieldIDType Id) const {
    assert(IsFinished && "not yet finished!");
    return Fields[Id].LayoutFieldIndex;
  }

  Field getLayoutField(FieldIDType Id) const {
    assert(IsFinished && "not yet finished!");
    return Fields[Id];
  }
};
} // namespace

void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) {
  auto Updater = [&](Value *I) {
    auto Field = B.getLayoutField(getFieldIndex(I));
    setFieldIndex(I, Field.LayoutFieldIndex);
    setAlign(I, Field.Alignment);
    uint64_t dynamicAlign =
        Field.DynamicAlignBuffer
            ? Field.DynamicAlignBuffer + Field.Alignment.value()
            : 0;
    setDynamicAlign(I, dynamicAlign);
    setOffset(I, Field.Offset);
  };
  LayoutIndexUpdateStarted = true;
  for (auto &S : Spills)
    Updater(S.first);
  for (const auto &A : Allocas)
    Updater(A.Alloca);
  LayoutIndexUpdateStarted = false;
}

void FrameTypeBuilder::addFieldForAllocas(const Function &F,
                                          FrameDataInfo &FrameData,
                                          coro::Shape &Shape) {
  using AllocaSetType = SmallVector<AllocaInst *, 4>;
  SmallVector<AllocaSetType, 4> NonOverlappedAllocas;

  // We need to add fields for the allocas at the end of this function.
  auto AddFieldForAllocasAtExit = make_scope_exit([&]() {
    for (auto AllocaList : NonOverlappedAllocas) {
      auto *LargestAI = *AllocaList.begin();
      FieldIDType Id = addFieldForAlloca(LargestAI);
      for (auto *Alloca : AllocaList)
        FrameData.setFieldIndex(Alloca, Id);
    }
  });

  if (!Shape.OptimizeFrame) {
    for (const auto &A : FrameData.Allocas) {
      AllocaInst *Alloca = A.Alloca;
      NonOverlappedAllocas.emplace_back(AllocaSetType(1, Alloca));
    }
    return;
  }

  // Because there are paths from the lifetime.start to coro.end
  // for each alloca, the live range of every alloca overlaps
  // with the blocks that contain coro.end and their successor blocks.
  // So we choose to skip these blocks when we calculate the live range
  // for each alloca. This should be reasonable since there shouldn't be uses
  // in these blocks and the coroutine frame shouldn't be used outside the
  // coroutine body.
  //
  // Note that the user of coro.suspend may not be a SwitchInst. However, this
  // case seems too complex to handle. And it is harmless to skip these
  // patterns, since doing so merely prevents the allocas from sharing the
  // same slot.
  DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest;
  for (auto *CoroSuspendInst : Shape.CoroSuspends) {
    for (auto *U : CoroSuspendInst->users()) {
      if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) {
        auto *SWI = const_cast<SwitchInst *>(ConstSWI);
        DefaultSuspendDest[SWI] = SWI->getDefaultDest();
        SWI->setDefaultDest(SWI->getSuccessor(1));
      }
    }
  }

  auto ExtractAllocas = [&]() {
    AllocaSetType Allocas;
    Allocas.reserve(FrameData.Allocas.size());
    for (const auto &A : FrameData.Allocas)
      Allocas.push_back(A.Alloca);
    return Allocas;
  };
  StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(),
                                      StackLifetime::LivenessType::May);
  StackLifetimeAnalyzer.run();
  auto IsAllocaInterference = [&](const AllocaInst *AI1,
                                  const AllocaInst *AI2) {
    return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps(
        StackLifetimeAnalyzer.getLiveRange(AI2));
  };
  auto GetAllocaSize = [&](const AllocaInfo &A) {
    std::optional<TypeSize> RetSize = A.Alloca->getAllocationSize(DL);
    assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n");
    assert(!RetSize->isScalable() && "Scalable vectors are not yet supported");
    return RetSize->getFixedValue();
  };
  // Put larger allocas in the front, so that larger allocas have higher
  // priority to merge, which can potentially save more space. This also keeps
  // each AllocaSet ordered, so we can easily get the largest alloca in an
  // AllocaSet.
  sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) {
    return GetAllocaSize(Iter1) > GetAllocaSize(Iter2);
  });
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    bool Merged = false;
    // Try to find an existing NonOverlappedAllocaSet that this alloca does
    // not interfere with. If we find one, insert the alloca into that set.
    for (auto &AllocaSet : NonOverlappedAllocas) {
      assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n");
      bool NoInterference = none_of(AllocaSet, [&](auto Iter) {
        return IsAllocaInterference(Alloca, Iter);
      });
      // If the alignment of A is a multiple of the alignment of B, the
      // address of A should satisfy the alignment requirement of B.
      //
      // There may be other, more fine-grained strategies to handle the
      // alignment information during the merging process. But they seem hard
      // to implement for little benefit.
      bool Alignable = [&]() -> bool {
        auto *LargestAlloca = *AllocaSet.begin();
        return LargestAlloca->getAlign().value() % Alloca->getAlign().value() ==
               0;
      }();
      bool CouldMerge = NoInterference && Alignable;
      if (!CouldMerge)
        continue;
      AllocaSet.push_back(Alloca);
      Merged = true;
      break;
    }
    if (!Merged) {
      NonOverlappedAllocas.emplace_back(AllocaSetType(1, Alloca));
    }
  }
  // Restore the default target destination for each switch statement that we
  // modified above.
  for (auto SwitchAndDefaultDest : DefaultSuspendDest) {
    SwitchInst *SWI = SwitchAndDefaultDest.first;
    BasicBlock *DestBB = SwitchAndDefaultDest.second;
    SWI->setDefaultDest(DestBB);
  }
  // This debug output tells us which allocas were merged into one slot.
  LLVM_DEBUG(for (auto &AllocaSet
                  : NonOverlappedAllocas) {
    if (AllocaSet.size() > 1) {
      dbgs() << "In Function:" << F.getName() << "\n";
      dbgs() << "Find Union Set "
             << "\n";
      dbgs() << "\tAllocas are \n";
      for (auto Alloca : AllocaSet)
        dbgs() << "\t\t" << *Alloca << "\n";
    }
  });
}

void FrameTypeBuilder::finish(StructType *Ty) {
  assert(!IsFinished && "already finished!");

  // Prepare the optimal-layout field array.
  // The Id in the layout field is a pointer to our Field for it.
  SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
  LayoutFields.reserve(Fields.size());
  for (auto &Field : Fields) {
    LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment,
                              Field.Offset);
  }

  // Perform layout.
  auto SizeAndAlign = performOptimizedStructLayout(LayoutFields);
  StructSize = SizeAndAlign.first;
  StructAlign = SizeAndAlign.second;

  auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & {
    return *static_cast<Field *>(const_cast<void*>(LayoutField.Id));
  };

  // We need to produce a packed struct type if there's a field whose
  // assigned offset isn't a multiple of its natural type alignment.
  bool Packed = [&] {
    for (auto &LayoutField : LayoutFields) {
      auto &F = getField(LayoutField);
      if (!isAligned(F.TyAlignment, LayoutField.Offset))
        return true;
    }
    return false;
  }();

  // Build the struct body.
  SmallVector<Type*, 16> FieldTypes;
  FieldTypes.reserve(LayoutFields.size() * 3 / 2);
  uint64_t LastOffset = 0;
  for (auto &LayoutField : LayoutFields) {
    auto &F = getField(LayoutField);

    auto Offset = LayoutField.Offset;

    // Add a padding field if there's a padding gap and we're either
    // building a packed struct or the padding gap is more than we'd
    // get from aligning to the field type's natural alignment.
    assert(Offset >= LastOffset);
    if (Offset != LastOffset) {
      if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset)
        FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context),
                                            Offset - LastOffset));
    }

    F.Offset = Offset;
    F.LayoutFieldIndex = FieldTypes.size();

    FieldTypes.push_back(F.Ty);
    if (F.DynamicAlignBuffer) {
      FieldTypes.push_back(
          ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer));
    }
    LastOffset = Offset + F.Size;
  }

  Ty->setBody(FieldTypes, Packed);

#ifndef NDEBUG
  // Check that the IR layout matches the offsets we expect.
  auto Layout = DL.getStructLayout(Ty);
  for (auto &F : Fields) {
    assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty);
    assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset);
  }
#endif

  IsFinished = true;
}

static void cacheDIVar(FrameDataInfo &FrameData,
                       DenseMap<Value *, DILocalVariable *> &DIVarCache) {
  for (auto *V : FrameData.getAllDefs()) {
    if (DIVarCache.contains(V))
      continue;

    auto CacheIt = [&DIVarCache, V](const auto &Container) {
      auto *I = llvm::find_if(Container, [](auto *DDI) {
        return DDI->getExpression()->getNumElements() == 0;
      });
      if (I != Container.end())
        DIVarCache.insert({V, (*I)->getVariable()});
    };
    CacheIt(findDbgDeclares(V));
    CacheIt(findDVRDeclares(V));
  }
}

/// Create a name for the Type. It uses MDString to store the newly created
/// string to avoid a memory leak.
static StringRef solveTypeName(Type *Ty) {
  if (Ty->isIntegerTy()) {
    // The longest name in common may be '__int_128', which has 9 characters.
    SmallString<16> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << "__int_" << cast<IntegerType>(Ty)->getBitWidth();
    auto *MDName = MDString::get(Ty->getContext(), OS.str());
    return MDName->getString();
  }

  if (Ty->isFloatingPointTy()) {
    if (Ty->isFloatTy())
      return "__float_";
    if (Ty->isDoubleTy())
      return "__double_";
    return "__floating_type_";
  }

  if (Ty->isPointerTy())
    return "PointerType";

  if (Ty->isStructTy()) {
    if (!cast<StructType>(Ty)->hasName())
      return "__LiteralStructType_";

    auto Name = Ty->getStructName();

    SmallString<16> Buffer(Name);
    for (auto &Iter : Buffer)
      if (Iter == '.' || Iter == ':')
        Iter = '_';
    auto *MDName = MDString::get(Ty->getContext(), Buffer.str());
    return MDName->getString();
  }

  return "UnknownType";
}

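// For illustration, the mapping produced by solveTypeName looks roughly like:
//   i32               -> "__int_32"
//   double            -> "__double_"
//   ptr               -> "PointerType"
//   %"class.std::foo" -> "class_std__foo"  (dots and colons replaced by '_')
// (The struct name is a made-up example; only the replacement rule comes from
// the code above.)
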
static DIType *solveDIType(DIBuilder &Builder, Type *Ty,
                           const DataLayout &Layout, DIScope *Scope,
                           unsigned LineNum,
                           DenseMap<Type *, DIType *> &DITypeCache) {
  if (DIType *DT = DITypeCache.lookup(Ty))
    return DT;

  StringRef Name = solveTypeName(Ty);

  DIType *RetType = nullptr;

  if (Ty->isIntegerTy()) {
    auto BitWidth = cast<IntegerType>(Ty)->getBitWidth();
    RetType = Builder.createBasicType(Name, BitWidth, dwarf::DW_ATE_signed,
                                      llvm::DINode::FlagArtificial);
  } else if (Ty->isFloatingPointTy()) {
    RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty),
                                      dwarf::DW_ATE_float,
                                      llvm::DINode::FlagArtificial);
  } else if (Ty->isPointerTy()) {
    // Construct a PointerType that points to null (aka void *) instead of
    // exploring the pointee type, to avoid an infinite search problem. For
    // example, we would be in trouble if we traversed recursively:
    //
    //   struct Node {
    //     Node* ptr;
    //   };
    RetType =
        Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty),
                                  Layout.getABITypeAlign(Ty).value() * CHAR_BIT,
                                  /*DWARFAddressSpace=*/std::nullopt, Name);
  } else if (Ty->isStructTy()) {
    auto *DIStruct = Builder.createStructType(
        Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty),
        Layout.getPrefTypeAlign(Ty).value() * CHAR_BIT,
        llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray());

    auto *StructTy = cast<StructType>(Ty);
    SmallVector<Metadata *, 16> Elements;
    for (unsigned I = 0; I < StructTy->getNumElements(); I++) {
      DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout,
                                 Scope, LineNum, DITypeCache);
      assert(DITy);
      Elements.push_back(Builder.createMemberType(
          Scope, DITy->getName(), Scope->getFile(), LineNum,
          DITy->getSizeInBits(), DITy->getAlignInBits(),
          Layout.getStructLayout(StructTy)->getElementOffsetInBits(I),
          llvm::DINode::FlagArtificial, DITy));
    }

    Builder.replaceArrays(DIStruct, Builder.getOrCreateArray(Elements));

    RetType = DIStruct;
  } else {
    LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n");
    TypeSize Size = Layout.getTypeSizeInBits(Ty);
    auto *CharSizeType = Builder.createBasicType(
        Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial);

    if (Size <= 8)
      RetType = CharSizeType;
    else {
      if (Size % 8 != 0)
        Size = TypeSize::getFixed(Size + 8 - (Size % 8));

      RetType = Builder.createArrayType(
          Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType,
          Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8)));
    }
  }

  DITypeCache.insert({Ty, RetType});
  return RetType;
}

/// Build artificial debug info for C++ coroutine frames to allow users to
/// inspect the contents of the frame directly.
///
/// Create debug information for the coroutine frame with the debug name
/// "__coro_frame". The debug information for the fields of the coroutine
/// frame is constructed in the following way:
/// 1. For every value in the frame, we search its dbg.declare uses to find the
///    corresponding debug variable for the value. If we can find the debug
///    variable, we can get full and accurate debug information.
/// 2. If we can't get debug information in step 1, we try to build a DIType
///    from the IR type alone. We do this in solveDIType. We only handle
///    integer, floating-point, pointer and struct types for now.
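///
/// As an illustrative sketch (the exact member list depends on the frame
/// layout), for a coroutine `foo` compiled with debug info this emits a
/// struct type named "foo.coro_frame_ty" and an artificial variable
/// "__coro_frame" of that type, whose members typically include __resume_fn,
/// __destroy_fn, __coro_index and one member per spilled variable, so a
/// debugger can display the live frame contents.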
static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
                                FrameDataInfo &FrameData) {
  DISubprogram *DIS = F.getSubprogram();
  // If there is no DISubprogram for F, it implies the function was not
  // compiled with debug info, so we don't need to generate debug info for
  // the frame either.
  if (!DIS || !DIS->getUnit() ||
      !dwarf::isCPlusPlus(
          (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()))
    return;

  assert(Shape.ABI == coro::ABI::Switch &&
         "We can only build debug information for C++ coroutines now.\n");

  DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);

  AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
  assert(PromiseAlloca &&
         "Coroutine with switch ABI should own Promise alloca");

  TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(PromiseAlloca);
  TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(PromiseAlloca);

  DILocalVariable *PromiseDIVariable = nullptr;
  DILocation *DILoc = nullptr;
  if (!DIs.empty()) {
    DbgDeclareInst *PromiseDDI = DIs.front();
    PromiseDIVariable = PromiseDDI->getVariable();
    DILoc = PromiseDDI->getDebugLoc().get();
  } else if (!DVRs.empty()) {
    DbgVariableRecord *PromiseDVR = DVRs.front();
    PromiseDIVariable = PromiseDVR->getVariable();
    DILoc = PromiseDVR->getDebugLoc().get();
  } else {
    return;
  }

  DILocalScope *PromiseDIScope = PromiseDIVariable->getScope();
  DIFile *DFile = PromiseDIScope->getFile();
  unsigned LineNum = PromiseDIVariable->getLine();

  DICompositeType *FrameDITy = DBuilder.createStructType(
      DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(),
      DFile, LineNum, Shape.FrameSize * 8,
      Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr,
      llvm::DINodeArray());
  StructType *FrameTy = Shape.FrameTy;
  SmallVector<Metadata *, 16> Elements;
  DataLayout Layout = F.getDataLayout();

  DenseMap<Value *, DILocalVariable *> DIVarCache;
  cacheDIVar(FrameData, DIVarCache);

  unsigned ResumeIndex = coro::Shape::SwitchFieldIndex::Resume;
  unsigned DestroyIndex = coro::Shape::SwitchFieldIndex::Destroy;
  unsigned IndexIndex = Shape.SwitchLowering.IndexField;

  DenseMap<unsigned, StringRef> NameCache;
  NameCache.insert({ResumeIndex, "__resume_fn"});
  NameCache.insert({DestroyIndex, "__destroy_fn"});
  NameCache.insert({IndexIndex, "__coro_index"});

  Type *ResumeFnTy = FrameTy->getElementType(ResumeIndex),
       *DestroyFnTy = FrameTy->getElementType(DestroyIndex),
       *IndexTy = FrameTy->getElementType(IndexIndex);

  DenseMap<unsigned, DIType *> TyCache;
  TyCache.insert(
      {ResumeIndex, DBuilder.createPointerType(
                        nullptr, Layout.getTypeSizeInBits(ResumeFnTy))});
  TyCache.insert(
      {DestroyIndex, DBuilder.createPointerType(
                         nullptr, Layout.getTypeSizeInBits(DestroyFnTy))});

  /// FIXME: If we fill the field `SizeInBits` with the actual size of
  /// __coro_index in bits, then __coro_index wouldn't show in the debugger.
  TyCache.insert({IndexIndex, DBuilder.createBasicType(
                                  "__coro_index",
                                  (Layout.getTypeSizeInBits(IndexTy) < 8)
                                      ? 8
                                      : Layout.getTypeSizeInBits(IndexTy),
                                  dwarf::DW_ATE_unsigned_char)});

  for (auto *V : FrameData.getAllDefs()) {
    if (!DIVarCache.contains(V))
      continue;

    auto Index = FrameData.getFieldIndex(V);

    NameCache.insert({Index, DIVarCache[V]->getName()});
    TyCache.insert({Index, DIVarCache[V]->getType()});
  }

  // Cache from index to (Align, Offset) pair.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> OffsetCache;
  // The Align and Offset of the Resume and Destroy functions are fixed.
  OffsetCache.insert({ResumeIndex, {8, 0}});
  OffsetCache.insert({DestroyIndex, {8, 8}});
  OffsetCache.insert(
      {IndexIndex,
       {Shape.SwitchLowering.IndexAlign, Shape.SwitchLowering.IndexOffset}});

  for (auto *V : FrameData.getAllDefs()) {
    auto Index = FrameData.getFieldIndex(V);

    OffsetCache.insert(
        {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}});
  }

  DenseMap<Type *, DIType *> DITypeCache;
  // This counter is used to avoid duplicate type names, e.g., there could be
  // many i32 and i64 types in one coroutine, and we would use i32_0 and i32_1
  // to distinguish them, since it makes no sense for the names of different
  // fields to conflict with each other.
  unsigned UnknownTypeNum = 0;
  for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
    if (!OffsetCache.contains(Index))
      continue;

    std::string Name;
    uint64_t SizeInBits;
    uint32_t AlignInBits;
    uint64_t OffsetInBits;
    DIType *DITy = nullptr;

    Type *Ty = FrameTy->getElementType(Index);
    assert(Ty->isSized() && "We can't handle type which is not sized.\n");
    SizeInBits = Layout.getTypeSizeInBits(Ty).getFixedValue();
    AlignInBits = OffsetCache[Index].first * 8;
    OffsetInBits = OffsetCache[Index].second * 8;

    if (NameCache.contains(Index)) {
      Name = NameCache[Index].str();
      DITy = TyCache[Index];
    } else {
      DITy = solveDIType(DBuilder, Ty, Layout, FrameDITy, LineNum, DITypeCache);
      assert(DITy && "SolveDIType shouldn't return nullptr.\n");
      Name = DITy->getName().str();
      Name += "_" + std::to_string(UnknownTypeNum);
      UnknownTypeNum++;
    }

    Elements.push_back(DBuilder.createMemberType(
        FrameDITy, Name, DFile, LineNum, SizeInBits, AlignInBits, OffsetInBits,
        llvm::DINode::FlagArtificial, DITy));
  }

  DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements));

  auto *FrameDIVar = DBuilder.createAutoVariable(PromiseDIScope, "__coro_frame",
                                                 DFile, LineNum, FrameDITy,
                                                 true, DINode::FlagArtificial);
  assert(FrameDIVar->isValidLocationForIntrinsic(DILoc));

  // The subprogram has a RetainedNodes field which records the debug
  // variables it contains, so we need to add __coro_frame to it.
  //
  // If we don't add __coro_frame to the RetainedNodes, the user may get
  // `no symbol __coro_frame in context` rather than the more precise
  // `__coro_frame is optimized out`.
  if (auto *SubProgram = dyn_cast<DISubprogram>(PromiseDIScope)) {
    auto RetainedNodes = SubProgram->getRetainedNodes();
    SmallVector<Metadata *, 32> RetainedNodesVec(RetainedNodes.begin(),
                                                 RetainedNodes.end());
    RetainedNodesVec.push_back(FrameDIVar);
    SubProgram->replaceOperandWith(
        7, (MDTuple::get(F.getContext(), RetainedNodesVec)));
  }

  if (UseNewDbgInfoFormat) {
    DbgVariableRecord *NewDVR =
        new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar,
                              DBuilder.createExpression(), DILoc,
                              DbgVariableRecord::LocationType::Declare);
    BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
    It->getParent()->insertDbgRecordBefore(NewDVR, It);
  } else {
    DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
                           DBuilder.createExpression(), DILoc,
                           &*Shape.getInsertPtAfterFramePtr());
  }
}

// Build a struct that will keep state for an active coroutine.
//   struct f.frame {
//     ResumeFnTy ResumeFnAddr;
//     ResumeFnTy DestroyFnAddr;
//     ... promise (if present) ...
//     int ResumeIndex;
//     ... spills ...
//   };
static StructType *buildFrameType(Function &F, coro::Shape &Shape,
                                  FrameDataInfo &FrameData) {
  LLVMContext &C = F.getContext();
  const DataLayout &DL = F.getDataLayout();
  StructType *FrameTy = [&] {
    SmallString<32> Name(F.getName());
    Name.append(".Frame");
    return StructType::create(C, Name);
  }();

  // We will use this value to cap the alignment of spilled values.
  std::optional<Align> MaxFrameAlignment;
  if (Shape.ABI == coro::ABI::Async)
    MaxFrameAlignment = Shape.AsyncLowering.getContextAlignment();
  FrameTypeBuilder B(C, DL, MaxFrameAlignment);

  AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
  std::optional<FieldIDType> SwitchIndexFieldId;

  if (Shape.ABI == coro::ABI::Switch) {
    auto *FnPtrTy = PointerType::getUnqual(C);

    // Add header fields for the resume and destroy functions.
    // We can rely on these being perfectly packed.
    (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);
    (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);

    // PromiseAlloca field needs to be explicitly added here because it's
    // a header field with a fixed offset based on its alignment. Hence it
    // needs special handling and cannot be added to FrameData.Allocas.
    if (PromiseAlloca)
      FrameData.setFieldIndex(
          PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true));

    // Add a field to store the suspend index. This doesn't need to
    // be in the header.
    unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
    Type *IndexType = Type::getIntNTy(C, IndexBits);

    SwitchIndexFieldId = B.addField(IndexType, std::nullopt);
  } else {
    assert(PromiseAlloca == nullptr && "lowering doesn't support promises");
  }

  // Because multiple allocas may share the same field slot,
  // we add the allocas to fields here.
  B.addFieldForAllocas(F, FrameData, Shape);
  // Add PromiseAlloca to the Allocas list so that
  //   1. updateLayoutIndex could update its index after
  //      `performOptimizedStructLayout`
  //   2. it is processed in insertSpills.
  if (Shape.ABI == coro::ABI::Switch && PromiseAlloca)
    // We assume that the promise alloca won't be modified before
    // CoroBegin and that no alias will be created before CoroBegin.
    FrameData.Allocas.emplace_back(
        PromiseAlloca, DenseMap<Instruction *, std::optional<APInt>>{}, false);
  // Create an entry for every spilled value.
  for (auto &S : FrameData.Spills) {
    Type *FieldType = S.first->getType();
    // For byval arguments, we need to store the pointed-to value in the
    // frame, instead of the pointer itself.
    if (const Argument *A = dyn_cast<Argument>(S.first))
      if (A->hasByValAttr())
        FieldType = A->getParamByValType();
    FieldIDType Id = B.addField(FieldType, std::nullopt, false /*header*/,
                                true /*IsSpillOfValue*/);
    FrameData.setFieldIndex(S.first, Id);
  }

  B.finish(FrameTy);
  FrameData.updateLayoutIndex(B);
  Shape.FrameAlign = B.getStructAlign();
  Shape.FrameSize = B.getStructSize();

  switch (Shape.ABI) {
  case coro::ABI::Switch: {
    // In the switch ABI, remember the switch-index field.
    auto IndexField = B.getLayoutField(*SwitchIndexFieldId);
    Shape.SwitchLowering.IndexField = IndexField.LayoutFieldIndex;
    Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value();
    Shape.SwitchLowering.IndexOffset = IndexField.Offset;

    // Also round the frame size up to a multiple of its alignment, as is
    // generally expected in C/C++.
    Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
    break;
  }

  // In the retcon ABI, remember whether the frame is inline in the storage.
  case coro::ABI::Retcon:
  case coro::ABI::RetconOnce: {
    auto Id = Shape.getRetconCoroId();
    Shape.RetconLowering.IsFrameInlineInStorage
      = (B.getStructSize() <= Id->getStorageSize() &&
         B.getStructAlign() <= Id->getStorageAlignment());
    break;
  }
  case coro::ABI::Async: {
    Shape.AsyncLowering.FrameOffset =
        alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign);
    // Also make the final context size a multiple of the context alignment to
    // make allocation easier for allocators.
    Shape.AsyncLowering.ContextSize =
        alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize,
                Shape.AsyncLowering.getContextAlignment());
    if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) {
      report_fatal_error(
          "The alignment requirement of frame variables cannot be higher than "
          "the alignment of the async function context");
    }
    break;
  }
  }

  return FrameTy;
}

// We use a pointer use visitor to track how an alloca is being used.
// The goal is to be able to answer the following three questions:
//   1. Should this alloca be allocated on the frame instead?
//   2. Could the content of the alloca be modified prior to CoroBegin, which
//      would require copying the data from the alloca to the frame after
//      CoroBegin?
//   3. Are there any aliases created for this alloca prior to CoroBegin but
//      used after CoroBegin? In that case, we will need to recreate the
//      aliases after CoroBegin based off the frame.
//
// To answer question 1, we track two things:
//   a. The list of all BasicBlocks that use this alloca or any of its aliases.
//      In the end, we check if there exist any two basic blocks that cross
//      suspension points. If so, this alloca must be put on the frame.
//   b. Whether the alloca or any alias of the alloca escapes at some point,
//      either by storing the address somewhere or by using the address in a
//      function call that might capture it. If it ever escapes, this alloca
//      must conservatively be put on the frame.
//
// To answer question 2, we track potential writes through the variable
// MayWriteBeforeCoroBegin. Whenever a potential write happens, either through
// a store instruction, a function call or any of the memory intrinsics, we
// check whether the instruction is prior to CoroBegin.
//
// To answer question 3, we track the offsets of all aliases created for the
// alloca prior to CoroBegin but used after CoroBegin. std::optional is used
// to be able to represent the case when the offset is unknown (e.g. when a
// PHINode takes in different offset values). We cannot handle unknown offsets
// and will assert. This is a known limitation; an ideal solution would likely
// require a significant redesign.
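//
// As an illustrative IR sketch of question 3 (names and offsets are made up):
//
//   %a   = alloca [16 x i8]
//   %p   = getelementptr inbounds [16 x i8], ptr %a, i64 0, i64 4 ; alias
//   %hdl = call ptr @llvm.coro.begin(...)
//   ...                                  ; suspends, %a moves to the frame
//   store i8 0, ptr %p                   ; alias used after coro.begin
//
// Here %p is an alias of %a created before coro.begin with a known offset of
// 4 bytes, so once %a is given a slot in the frame, %p must be recreated as a
// GEP off the frame copy of %a at that offset.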
namespace {
struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
  using Base = PtrUseVisitor<AllocaUseVisitor>;
  AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
                   const coro::Shape &CoroShape,
                   const SuspendCrossingInfo &Checker,
                   bool ShouldUseLifetimeStartInfo)
      : PtrUseVisitor(DL), DT(DT), CoroShape(CoroShape), Checker(Checker),
        ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {
    for (AnyCoroSuspendInst *SuspendInst : CoroShape.CoroSuspends)
      CoroSuspendBBs.insert(SuspendInst->getParent());
  }

  void visit(Instruction &I) {
    Users.insert(&I);
    Base::visit(I);
    // If the pointer escaped prior to CoroBegin, we have to assume it would
    // be written into before CoroBegin as well.
    if (PI.isEscaped() &&
        !DT.dominates(CoroShape.CoroBegin, PI.getEscapingInst())) {
      MayWriteBeforeCoroBegin = true;
    }
  }
  // We need to provide this overload as PtrUseVisitor uses a pointer-based
  // visiting function.
  void visit(Instruction *I) { return visit(*I); }

  void visitPHINode(PHINode &I) {
    enqueueUsers(I);
    handleAlias(I);
  }

  void visitSelectInst(SelectInst &I) {
    enqueueUsers(I);
    handleAlias(I);
  }

  void visitStoreInst(StoreInst &SI) {
    // Regardless of whether the alias of the alloca is the value operand or
    // the pointer operand, we need to assume the alloca has been written to.
    handleMayWrite(SI);

    if (SI.getValueOperand() != U->get())
      return;

    // We are storing the pointer into a memory location, potentially escaping.
    // As an optimization, we try to detect simple cases where it doesn't
    // actually escape, for example:
    //     %ptr = alloca ..
    //     %addr = alloca ..
    //     store %ptr, %addr
    //     %x = load %addr
    //     ..
    // If %addr is only used by loading from it, we could simply treat %x as
    // another alias of %ptr, and not consider %ptr as escaped.
    auto IsSimpleStoreThenLoad = [&]() {
      auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand());
      // If the memory location we are storing to is not an alloca, it
      // could be an alias of some other memory location, which is difficult
      // to analyze.
      if (!AI)
        return false;
      // StoreAliases contains aliases of the memory location stored into.
      SmallVector<Instruction *, 4> StoreAliases = {AI};
      while (!StoreAliases.empty()) {
        Instruction *I = StoreAliases.pop_back_val();
        for (User *U : I->users()) {
          // If we are loading from the memory location, we are creating an
          // alias of the original pointer.
          if (auto *LI = dyn_cast<LoadInst>(U)) {
            enqueueUsers(*LI);
            handleAlias(*LI);
            continue;
          }
          // If we are overwriting the memory location, the pointer certainly
          // won't escape.
          if (auto *S = dyn_cast<StoreInst>(U))
            if (S->getPointerOperand() == I)
              continue;
          if (auto *II = dyn_cast<IntrinsicInst>(U))
            if (II->isLifetimeStartOrEnd())
              continue;
          // BitCastInst creates aliases of the memory location being stored
          // into.
          if (auto *BI = dyn_cast<BitCastInst>(U)) {
            StoreAliases.push_back(BI);
            continue;
          }
          return false;
        }
      }

      return true;
    };

    if (!IsSimpleStoreThenLoad())
      PI.setEscaped(&SI);
  }

  // All mem intrinsics modify the data.
  void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); }

  void visitBitCastInst(BitCastInst &BC) {
    Base::visitBitCastInst(BC);
    handleAlias(BC);
  }

  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    Base::visitAddrSpaceCastInst(ASC);
    handleAlias(ASC);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    // The base visitor will adjust Offset accordingly.
    Base::visitGetElementPtrInst(GEPI);
    handleAlias(GEPI);
  }

  void visitIntrinsicInst(IntrinsicInst &II) {
    // When we find that the lifetime markers refer to a subrange of the
    // original alloca, ignore the lifetime markers to avoid misleading the
    // analysis.
    if (!IsOffsetKnown || !Offset.isZero())
      return Base::visitIntrinsicInst(II);
    switch (II.getIntrinsicID()) {
    default:
      return Base::visitIntrinsicInst(II);
    case Intrinsic::lifetime_start:
      LifetimeStarts.insert(&II);
      LifetimeStartBBs.push_back(II.getParent());
      break;
    case Intrinsic::lifetime_end:
      LifetimeEndBBs.insert(II.getParent());
      break;
    }
  }

  void visitCallBase(CallBase &CB) {
    for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op)
      if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op))
        PI.setEscaped(&CB);
    handleMayWrite(CB);
  }

  bool getShouldLiveOnFrame() const {
    if (!ShouldLiveOnFrame)
      ShouldLiveOnFrame = computeShouldLiveOnFrame();
    return *ShouldLiveOnFrame;
  }

  bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; }

  DenseMap<Instruction *, std::optional<APInt>> getAliasesCopy() const {
    assert(getShouldLiveOnFrame() && "This method should only be called if the "
                                     "alloca needs to live on the frame.");
    for (const auto &P : AliasOffsetMap)
      if (!P.second)
        report_fatal_error("Unable to handle an alias with unknown offset "
                           "created before CoroBegin.");
    return AliasOffsetMap;
  }

private:
  const DominatorTree &DT;
  const coro::Shape &CoroShape;
  const SuspendCrossingInfo &Checker;
  // All aliases of the original AllocaInst, created before CoroBegin and used
  // after CoroBegin. Each entry contains the instruction and the offset into
  // the original alloca. They need to be recreated after CoroBegin off the
  // frame.
1611 DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{}; 1612 SmallPtrSet<Instruction *, 4> Users{}; 1613 SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{}; 1614 SmallVector<BasicBlock *> LifetimeStartBBs{}; 1615 SmallPtrSet<BasicBlock *, 2> LifetimeEndBBs{}; 1616 SmallPtrSet<const BasicBlock *, 2> CoroSuspendBBs{}; 1617 bool MayWriteBeforeCoroBegin{false}; 1618 bool ShouldUseLifetimeStartInfo{true}; 1619 1620 mutable std::optional<bool> ShouldLiveOnFrame{}; 1621 1622 bool computeShouldLiveOnFrame() const { 1623 // If lifetime information is available, we check it first since it's 1624 // more precise. We look at every pair of lifetime.start intrinsic and 1625 // every basic block that uses the pointer to see if they cross suspension 1626 // points. The uses cover both direct uses as well as indirect uses. 1627 if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) { 1628 // If there is no explicit lifetime.end, then assume the address can 1629 // cross suspension points. 1630 if (LifetimeEndBBs.empty()) 1631 return true; 1632 1633 // If there is a path from a lifetime.start to a suspend without a 1634 // corresponding lifetime.end, then the alloca's lifetime persists 1635 // beyond that suspension point and the alloca must go on the frame. 1636 llvm::SmallVector<BasicBlock *> Worklist(LifetimeStartBBs); 1637 if (isManyPotentiallyReachableFromMany(Worklist, CoroSuspendBBs, 1638 &LifetimeEndBBs, &DT)) 1639 return true; 1640 1641 // Addresses are guaranteed to be identical after every lifetime.start so 1642 // we cannot use the local stack if the address escaped and there is a 1643 // suspend point between lifetime markers. This should also cover the 1644 // case of a single lifetime.start intrinsic in a loop with suspend point. 1645 if (PI.isEscaped()) { 1646 for (auto *A : LifetimeStarts) { 1647 for (auto *B : LifetimeStarts) { 1648 if (Checker.hasPathOrLoopCrossingSuspendPoint(A->getParent(), 1649 B->getParent())) 1650 return true; 1651 } 1652 } 1653 } 1654 return false; 1655 } 1656 // FIXME: Ideally the isEscaped check should come at the beginning. 1657 // However there are a few loose ends that need to be fixed first before 1658 // we can do that. We need to make sure we are not over-conservative, so 1659 // that the data accessed in-between await_suspend and symmetric transfer 1660 // is always put on the stack, and also data accessed after coro.end is 1661 // always put on the stack (esp the return object). To fix that, we need 1662 // to: 1663 // 1) Potentially treat sret as nocapture in calls 1664 // 2) Special handle the return object and put it on the stack 1665 // 3) Utilize lifetime.end intrinsic 1666 if (PI.isEscaped()) 1667 return true; 1668 1669 for (auto *U1 : Users) 1670 for (auto *U2 : Users) 1671 if (Checker.isDefinitionAcrossSuspend(*U1, U2)) 1672 return true; 1673 1674 return false; 1675 } 1676 1677 void handleMayWrite(const Instruction &I) { 1678 if (!DT.dominates(CoroShape.CoroBegin, &I)) 1679 MayWriteBeforeCoroBegin = true; 1680 } 1681 1682 bool usedAfterCoroBegin(Instruction &I) { 1683 for (auto &U : I.uses()) 1684 if (DT.dominates(CoroShape.CoroBegin, U)) 1685 return true; 1686 return false; 1687 } 1688 1689 void handleAlias(Instruction &I) { 1690 // We track all aliases created prior to CoroBegin but used after. 1691 // These aliases may need to be recreated after CoroBegin if the alloca 1692 // need to live on the frame. 
1693 if (DT.dominates(CoroShape.CoroBegin, &I) || !usedAfterCoroBegin(I)) 1694 return; 1695 1696 if (!IsOffsetKnown) { 1697 AliasOffetMap[&I].reset(); 1698 } else { 1699 auto Itr = AliasOffetMap.find(&I); 1700 if (Itr == AliasOffetMap.end()) { 1701 AliasOffetMap[&I] = Offset; 1702 } else if (Itr->second && *Itr->second != Offset) { 1703 // If we have seen two different possible values for this alias, we set 1704 // it to empty. 1705 AliasOffetMap[&I].reset(); 1706 } 1707 } 1708 } 1709 }; 1710 } // namespace 1711 1712 // We need to make room to insert a spill after initial PHIs, but before 1713 // catchswitch instruction. Placing it before violates the requirement that 1714 // catchswitch, like all other EHPads must be the first nonPHI in a block. 1715 // 1716 // Split away catchswitch into a separate block and insert in its place: 1717 // 1718 // cleanuppad <InsertPt> cleanupret. 1719 // 1720 // cleanupret instruction will act as an insert point for the spill. 1721 static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { 1722 BasicBlock *CurrentBlock = CatchSwitch->getParent(); 1723 BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch); 1724 CurrentBlock->getTerminator()->eraseFromParent(); 1725 1726 auto *CleanupPad = 1727 CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock); 1728 auto *CleanupRet = 1729 CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock); 1730 return CleanupRet; 1731 } 1732 1733 // Replace all alloca and SSA values that are accessed across suspend points 1734 // with GetElementPointer from coroutine frame + loads and stores. Create an 1735 // AllocaSpillBB that will become the new entry block for the resume parts of 1736 // the coroutine: 1737 // 1738 // %hdl = coro.begin(...) 1739 // whatever 1740 // 1741 // becomes: 1742 // 1743 // %hdl = coro.begin(...) 1744 // br label %AllocaSpillBB 1745 // 1746 // AllocaSpillBB: 1747 // ; geps corresponding to allocas that were moved to coroutine frame 1748 // br label PostSpill 1749 // 1750 // PostSpill: 1751 // whatever 1752 // 1753 // 1754 static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { 1755 auto *CB = Shape.CoroBegin; 1756 LLVMContext &C = CB->getContext(); 1757 Function *F = CB->getFunction(); 1758 IRBuilder<> Builder(C); 1759 StructType *FrameTy = Shape.FrameTy; 1760 Value *FramePtr = Shape.FramePtr; 1761 DominatorTree DT(*F); 1762 SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap; 1763 1764 // Create a GEP with the given index into the coroutine frame for the original 1765 // value Orig. Appends an extra 0 index for array-allocas, preserving the 1766 // original type. 
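// Illustrative sketch (hypothetical frame type and field index): for a value
// assigned to field 2 of the frame, the lambda below produces roughly
//   %gep = getelementptr inbounds %f.Frame, ptr %FramePtr, i32 0, i32 2
// with an extra trailing i32 0 index appended for array allocas, and an
// explicit align-up (ptrtoint/add/and/inttoptr) when the field requires
// dynamic alignment.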
1767 auto GetFramePointer = [&](Value *Orig) -> Value * { 1768 FieldIDType Index = FrameData.getFieldIndex(Orig); 1769 SmallVector<Value *, 3> Indices = { 1770 ConstantInt::get(Type::getInt32Ty(C), 0), 1771 ConstantInt::get(Type::getInt32Ty(C), Index), 1772 }; 1773 1774 if (auto *AI = dyn_cast<AllocaInst>(Orig)) { 1775 if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) { 1776 auto Count = CI->getValue().getZExtValue(); 1777 if (Count > 1) { 1778 Indices.push_back(ConstantInt::get(Type::getInt32Ty(C), 0)); 1779 } 1780 } else { 1781 report_fatal_error("Coroutines cannot handle non static allocas yet"); 1782 } 1783 } 1784 1785 auto GEP = cast<GetElementPtrInst>( 1786 Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices)); 1787 if (auto *AI = dyn_cast<AllocaInst>(Orig)) { 1788 if (FrameData.getDynamicAlign(Orig) != 0) { 1789 assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value()); 1790 auto *M = AI->getModule(); 1791 auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType()); 1792 auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy); 1793 auto *AlignMask = 1794 ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1); 1795 PtrValue = Builder.CreateAdd(PtrValue, AlignMask); 1796 PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask)); 1797 return Builder.CreateIntToPtr(PtrValue, AI->getType()); 1798 } 1799 // If the type of GEP is not equal to the type of AllocaInst, it implies 1800 // that the AllocaInst may be reused in the Frame slot of other 1801 // AllocaInst. So We cast GEP to the AllocaInst here to re-use 1802 // the Frame storage. 1803 // 1804 // Note: If we change the strategy dealing with alignment, we need to refine 1805 // this casting. 1806 if (GEP->getType() != Orig->getType()) 1807 return Builder.CreateAddrSpaceCast(GEP, Orig->getType(), 1808 Orig->getName() + Twine(".cast")); 1809 } 1810 return GEP; 1811 }; 1812 1813 for (auto const &E : FrameData.Spills) { 1814 Value *Def = E.first; 1815 auto SpillAlignment = Align(FrameData.getAlign(Def)); 1816 // Create a store instruction storing the value into the 1817 // coroutine frame. 1818 BasicBlock::iterator InsertPt; 1819 Type *ByValTy = nullptr; 1820 if (auto *Arg = dyn_cast<Argument>(Def)) { 1821 // For arguments, we will place the store instruction right after 1822 // the coroutine frame pointer instruction, i.e. coro.begin. 1823 InsertPt = Shape.getInsertPtAfterFramePtr(); 1824 1825 // If we're spilling an Argument, make sure we clear 'nocapture' 1826 // from the coroutine function. 1827 Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture); 1828 1829 if (Arg->hasByValAttr()) 1830 ByValTy = Arg->getParamByValType(); 1831 } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) { 1832 // Don't spill immediately after a suspend; splitting assumes 1833 // that the suspend will be followed by a branch. 1834 InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHIIt(); 1835 } else { 1836 auto *I = cast<Instruction>(Def); 1837 if (!DT.dominates(CB, I)) { 1838 // If it is not dominated by CoroBegin, then spill should be 1839 // inserted immediately after CoroFrame is computed. 1840 InsertPt = Shape.getInsertPtAfterFramePtr(); 1841 } else if (auto *II = dyn_cast<InvokeInst>(I)) { 1842 // If we are spilling the result of the invoke instruction, split 1843 // the normal edge and insert the spill in the new block. 
1844 auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest()); 1845 InsertPt = NewBB->getTerminator()->getIterator(); 1846 } else if (isa<PHINode>(I)) { 1847 // Skip the PHINodes and EH pads instructions. 1848 BasicBlock *DefBlock = I->getParent(); 1849 if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) 1850 InsertPt = splitBeforeCatchSwitch(CSI)->getIterator(); 1851 else 1852 InsertPt = DefBlock->getFirstInsertionPt(); 1853 } else { 1854 assert(!I->isTerminator() && "unexpected terminator"); 1855 // For all other values, the spill is placed immediately after 1856 // the definition. 1857 InsertPt = I->getNextNode()->getIterator(); 1858 } 1859 } 1860 1861 auto Index = FrameData.getFieldIndex(Def); 1862 Builder.SetInsertPoint(InsertPt->getParent(), InsertPt); 1863 auto *G = Builder.CreateConstInBoundsGEP2_32( 1864 FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr")); 1865 if (ByValTy) { 1866 // For byval arguments, we need to store the pointed value in the frame, 1867 // instead of the pointer itself. 1868 auto *Value = Builder.CreateLoad(ByValTy, Def); 1869 Builder.CreateAlignedStore(Value, G, SpillAlignment); 1870 } else { 1871 Builder.CreateAlignedStore(Def, G, SpillAlignment); 1872 } 1873 1874 BasicBlock *CurrentBlock = nullptr; 1875 Value *CurrentReload = nullptr; 1876 for (auto *U : E.second) { 1877 // If we have not seen the use block, create a load instruction to reload 1878 // the spilled value from the coroutine frame. Populates the Value pointer 1879 // reference provided with the frame GEP. 1880 if (CurrentBlock != U->getParent()) { 1881 CurrentBlock = U->getParent(); 1882 Builder.SetInsertPoint(CurrentBlock, 1883 CurrentBlock->getFirstInsertionPt()); 1884 1885 auto *GEP = GetFramePointer(E.first); 1886 GEP->setName(E.first->getName() + Twine(".reload.addr")); 1887 if (ByValTy) 1888 CurrentReload = GEP; 1889 else 1890 CurrentReload = Builder.CreateAlignedLoad( 1891 FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP, 1892 SpillAlignment, E.first->getName() + Twine(".reload")); 1893 1894 TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def); 1895 TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(Def); 1896 // Try best to find dbg.declare. If the spill is a temp, there may not 1897 // be a direct dbg.declare. Walk up the load chain to find one from an 1898 // alias. 1899 if (F->getSubprogram()) { 1900 auto *CurDef = Def; 1901 while (DIs.empty() && DVRs.empty() && isa<LoadInst>(CurDef)) { 1902 auto *LdInst = cast<LoadInst>(CurDef); 1903 // Only consider ptr to ptr same type load. 1904 if (LdInst->getPointerOperandType() != LdInst->getType()) 1905 break; 1906 CurDef = LdInst->getPointerOperand(); 1907 if (!isa<AllocaInst, LoadInst>(CurDef)) 1908 break; 1909 DIs = findDbgDeclares(CurDef); 1910 DVRs = findDVRDeclares(CurDef); 1911 } 1912 } 1913 1914 auto SalvageOne = [&](auto *DDI) { 1915 bool AllowUnresolved = false; 1916 // This dbg.declare is preserved for all coro-split function 1917 // fragments. It will be unreachable in the main function, and 1918 // processed by coro::salvageDebugInfo() by CoroCloner. 
1919 if (UseNewDbgInfoFormat) { 1920 DbgVariableRecord *NewDVR = new DbgVariableRecord( 1921 ValueAsMetadata::get(CurrentReload), DDI->getVariable(), 1922 DDI->getExpression(), DDI->getDebugLoc(), 1923 DbgVariableRecord::LocationType::Declare); 1924 Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore( 1925 NewDVR, Builder.GetInsertPoint()); 1926 } else { 1927 DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved) 1928 .insertDeclare(CurrentReload, DDI->getVariable(), 1929 DDI->getExpression(), DDI->getDebugLoc(), 1930 &*Builder.GetInsertPoint()); 1931 } 1932 // This dbg.declare is for the main function entry point. It 1933 // will be deleted in all coro-split functions. 1934 coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame, 1935 false /*UseEntryValue*/); 1936 }; 1937 for_each(DIs, SalvageOne); 1938 for_each(DVRs, SalvageOne); 1939 } 1940 1941 // If we have a single edge PHINode, remove it and replace it with a 1942 // reload from the coroutine frame. (We already took care of multi edge 1943 // PHINodes by rewriting them in the rewritePHIs function). 1944 if (auto *PN = dyn_cast<PHINode>(U)) { 1945 assert(PN->getNumIncomingValues() == 1 && 1946 "unexpected number of incoming " 1947 "values in the PHINode"); 1948 PN->replaceAllUsesWith(CurrentReload); 1949 PN->eraseFromParent(); 1950 continue; 1951 } 1952 1953 // Replace all uses of CurrentValue in the current instruction with 1954 // reload. 1955 U->replaceUsesOfWith(Def, CurrentReload); 1956 // Instructions are added to Def's user list if the attached 1957 // debug records use Def. Update those now. 1958 for (DbgVariableRecord &DVR : filterDbgVars(U->getDbgRecordRange())) 1959 DVR.replaceVariableLocationOp(Def, CurrentReload, true); 1960 } 1961 } 1962 1963 BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent(); 1964 1965 auto SpillBlock = FramePtrBB->splitBasicBlock( 1966 Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB"); 1967 SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); 1968 Shape.AllocaSpillBlock = SpillBlock; 1969 1970 // retcon and retcon.once lowering assumes all uses have been sunk. 1971 if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce || 1972 Shape.ABI == coro::ABI::Async) { 1973 // If we found any allocas, replace all of their remaining uses with GEPs. 1974 Builder.SetInsertPoint(SpillBlock, SpillBlock->begin()); 1975 for (const auto &P : FrameData.Allocas) { 1976 AllocaInst *Alloca = P.Alloca; 1977 auto *G = GetFramePointer(Alloca); 1978 1979 // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) 1980 // here, as we are changing the location of the instruction. 1981 G->takeName(Alloca); 1982 Alloca->replaceAllUsesWith(G); 1983 Alloca->eraseFromParent(); 1984 } 1985 return; 1986 } 1987 1988 // If we found any allocas, replace all of their remaining uses with GEP 1989 // instructions. To retain debuggability, we replace the uses of allocas for 1990 // dbg.declares and dbg.values with the reload from the frame. 1991 // Note: We cannot replace the alloca with GEP instructions indiscriminately, 1992 // as some of the uses may not be dominated by CoroBegin.
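// As a minimal, illustrative sketch (hypothetical names and field index): a
// use of the alloca that is dominated by coro.begin, e.g.
//   %x = alloca i64
//   ...
//   store i64 1, ptr %x
// is rewritten below to go through the frame slot instead:
//   %x.reload.addr = getelementptr inbounds %f.Frame, ptr %FramePtr, i32 0, i32 2
//   store i64 1, ptr %x.reload.addr
// while uses not dominated by coro.begin keep referring to the original alloca.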
1993 Builder.SetInsertPoint(Shape.AllocaSpillBlock, 1994 Shape.AllocaSpillBlock->begin()); 1995 SmallVector<Instruction *, 4> UsersToUpdate; 1996 for (const auto &A : FrameData.Allocas) { 1997 AllocaInst *Alloca = A.Alloca; 1998 UsersToUpdate.clear(); 1999 for (User *U : Alloca->users()) { 2000 auto *I = cast<Instruction>(U); 2001 if (DT.dominates(CB, I)) 2002 UsersToUpdate.push_back(I); 2003 } 2004 if (UsersToUpdate.empty()) 2005 continue; 2006 auto *G = GetFramePointer(Alloca); 2007 G->setName(Alloca->getName() + Twine(".reload.addr")); 2008 2009 SmallVector<DbgVariableIntrinsic *, 4> DIs; 2010 SmallVector<DbgVariableRecord *> DbgVariableRecords; 2011 findDbgUsers(DIs, Alloca, &DbgVariableRecords); 2012 for (auto *DVI : DIs) 2013 DVI->replaceUsesOfWith(Alloca, G); 2014 for (auto *DVR : DbgVariableRecords) 2015 DVR->replaceVariableLocationOp(Alloca, G); 2016 2017 for (Instruction *I : UsersToUpdate) { 2018 // It is meaningless to retain lifetime intrinsics that refer to a 2019 // member of the coroutine frame, and such leftover lifetime intrinsics 2020 // may block further optimizations. 2021 if (I->isLifetimeStartOrEnd()) { 2022 I->eraseFromParent(); 2023 continue; 2024 } 2025 2026 I->replaceUsesOfWith(Alloca, G); 2027 } 2028 } 2029 Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr()); 2030 for (const auto &A : FrameData.Allocas) { 2031 AllocaInst *Alloca = A.Alloca; 2032 if (A.MayWriteBeforeCoroBegin) { 2033 // isEscaped really means potentially modified before CoroBegin. 2034 if (Alloca->isArrayAllocation()) 2035 report_fatal_error( 2036 "Coroutines cannot handle copying of array allocas yet"); 2037 2038 auto *G = GetFramePointer(Alloca); 2039 auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca); 2040 Builder.CreateStore(Value, G); 2041 } 2042 // For each alias of Alloca created before CoroBegin but used after 2043 // CoroBegin, we recreate it after CoroBegin by applying the offset 2044 // to the pointer in the frame. 2045 for (const auto &Alias : A.Aliases) { 2046 auto *FramePtr = GetFramePointer(Alloca); 2047 auto &Value = *Alias.second; 2048 auto ITy = IntegerType::get(C, Value.getBitWidth()); 2049 auto *AliasPtr = 2050 Builder.CreatePtrAdd(FramePtr, ConstantInt::get(ITy, Value)); 2051 Alias.first->replaceUsesWithIf( 2052 AliasPtr, [&](Use &U) { return DT.dominates(CB, U); }); 2053 } 2054 } 2055 2056 // PromiseAlloca is not collected in FrameData.Allocas, so the code above does 2057 // not handle the case where the PromiseAlloca may be written before 2058 // CoroBegin, which may be problematic in edge cases. See 2059 // https://github.com/llvm/llvm-project/issues/57861 for an example. 2060 if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) { 2061 AllocaInst *PA = Shape.SwitchLowering.PromiseAlloca; 2062 // Check whether there is a memory access to the promise alloca before CoroBegin. 2063 bool HasAccessingPromiseBeforeCB = llvm::any_of(PA->uses(), [&](Use &U) { 2064 auto *Inst = dyn_cast<Instruction>(U.getUser()); 2065 if (!Inst || DT.dominates(CB, Inst)) 2066 return false; 2067 2068 if (auto *CI = dyn_cast<CallInst>(Inst)) { 2069 // It is fine if the call does not write to the promise. This is 2070 // possible for @llvm.coro.id intrinsics, which 2071 // take the promise as the second argument as a 2072 // marker.
2073 if (CI->onlyReadsMemory() || 2074 CI->onlyReadsMemory(CI->getArgOperandNo(&U))) 2075 return false; 2076 return true; 2077 } 2078 2079 return isa<StoreInst>(Inst) || 2080 // It may take too much time to track the uses, so 2081 // be conservative about the case where the use may escape. 2082 isa<GetElementPtrInst>(Inst) || 2083 // Before opaque pointers were enabled there would always be a 2084 // bitcast for the promise alloca; now that opaque pointers are 2085 // enabled by default, this should be 2086 // fine. 2087 isa<BitCastInst>(Inst); 2088 }); 2089 if (HasAccessingPromiseBeforeCB) { 2090 Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr()); 2091 auto *G = GetFramePointer(PA); 2092 auto *Value = Builder.CreateLoad(PA->getAllocatedType(), PA); 2093 Builder.CreateStore(Value, G); 2094 } 2095 } 2096 } 2097 2098 // Moves the values in the PHIs in SuccBB that correspond to PredBB into a new 2099 // PHI in InsertedBB. 2100 static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB, 2101 BasicBlock *InsertedBB, 2102 BasicBlock *PredBB, 2103 PHINode *UntilPHI = nullptr) { 2104 auto *PN = cast<PHINode>(&SuccBB->front()); 2105 do { 2106 int Index = PN->getBasicBlockIndex(InsertedBB); 2107 Value *V = PN->getIncomingValue(Index); 2108 PHINode *InputV = PHINode::Create( 2109 V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName()); 2110 InputV->insertBefore(InsertedBB->begin()); 2111 InputV->addIncoming(V, PredBB); 2112 PN->setIncomingValue(Index, InputV); 2113 PN = dyn_cast<PHINode>(PN->getNextNode()); 2114 } while (PN != UntilPHI); 2115 } 2116 2117 // Rewrites the PHI Nodes in a cleanuppad. 2118 static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB, 2119 CleanupPadInst *CleanupPad) { 2120 // For every incoming edge to a CleanupPad we will create a new block holding 2121 // all incoming values in single-value PHI nodes. We will then create another 2122 // block to act as a dispatcher (as all unwind edges for related EH blocks 2123 // must be the same). 2124 // 2125 // cleanuppad: 2126 // %2 = phi i32[%0, %catchswitch], [%1, %catch.1] 2127 // %3 = cleanuppad within none [] 2128 // 2129 // It will create: 2130 // 2131 // cleanuppad.corodispatch 2132 // %2 = phi i8[0, %catchswitch], [1, %catch.1] 2133 // %3 = cleanuppad within none [] 2134 // switch i8 %2, label %unreachable 2135 // [i8 0, label %cleanuppad.from.catchswitch 2136 // i8 1, label %cleanuppad.from.catch.1] 2137 // cleanuppad.from.catchswitch: 2138 // %4 = phi i32 [%0, %catchswitch] 2139 // br label %cleanuppad 2140 // cleanuppad.from.catch.1: 2141 // %6 = phi i32 [%1, %catch.1] 2142 // br label %cleanuppad 2143 // cleanuppad: 2144 // %8 = phi i32 [%4, %cleanuppad.from.catchswitch], 2145 // [%6, %cleanuppad.from.catch.1] 2146 2147 // Unreachable BB, in case switching on an invalid value in the dispatcher. 2148 auto *UnreachBB = BasicBlock::Create( 2149 CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent()); 2150 IRBuilder<> Builder(UnreachBB); 2151 Builder.CreateUnreachable(); 2152 2153 // Create a new cleanuppad which will be the dispatcher.
2154 auto *NewCleanupPadBB = 2155 BasicBlock::Create(CleanupPadBB->getContext(), 2156 CleanupPadBB->getName() + Twine(".corodispatch"), 2157 CleanupPadBB->getParent(), CleanupPadBB); 2158 Builder.SetInsertPoint(NewCleanupPadBB); 2159 auto *SwitchType = Builder.getInt8Ty(); 2160 auto *SetDispatchValuePN = 2161 Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB)); 2162 CleanupPad->removeFromParent(); 2163 CleanupPad->insertAfter(SetDispatchValuePN); 2164 auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB, 2165 pred_size(CleanupPadBB)); 2166 2167 int SwitchIndex = 0; 2168 SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB)); 2169 for (BasicBlock *Pred : Preds) { 2170 // Create a new cleanuppad and move the PHI values to there. 2171 auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(), 2172 CleanupPadBB->getName() + 2173 Twine(".from.") + Pred->getName(), 2174 CleanupPadBB->getParent(), CleanupPadBB); 2175 updatePhiNodes(CleanupPadBB, Pred, CaseBB); 2176 CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") + 2177 Pred->getName()); 2178 Builder.SetInsertPoint(CaseBB); 2179 Builder.CreateBr(CleanupPadBB); 2180 movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB); 2181 2182 // Update this Pred to the new unwind point. 2183 setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB); 2184 2185 // Setup the switch in the dispatcher. 2186 auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex); 2187 SetDispatchValuePN->addIncoming(SwitchConstant, Pred); 2188 SwitchOnDispatch->addCase(SwitchConstant, CaseBB); 2189 SwitchIndex++; 2190 } 2191 } 2192 2193 static void cleanupSinglePredPHIs(Function &F) { 2194 SmallVector<PHINode *, 32> Worklist; 2195 for (auto &BB : F) { 2196 for (auto &Phi : BB.phis()) { 2197 if (Phi.getNumIncomingValues() == 1) { 2198 Worklist.push_back(&Phi); 2199 } else 2200 break; 2201 } 2202 } 2203 while (!Worklist.empty()) { 2204 auto *Phi = Worklist.pop_back_val(); 2205 auto *OriginalValue = Phi->getIncomingValue(0); 2206 Phi->replaceAllUsesWith(OriginalValue); 2207 } 2208 } 2209 2210 static void rewritePHIs(BasicBlock &BB) { 2211 // For every incoming edge we will create a block holding all 2212 // incoming values in a single PHI nodes. 2213 // 2214 // loop: 2215 // %n.val = phi i32[%n, %entry], [%inc, %loop] 2216 // 2217 // It will create: 2218 // 2219 // loop.from.entry: 2220 // %n.loop.pre = phi i32 [%n, %entry] 2221 // br %label loop 2222 // loop.from.loop: 2223 // %inc.loop.pre = phi i32 [%inc, %loop] 2224 // br %label loop 2225 // 2226 // After this rewrite, further analysis will ignore any phi nodes with more 2227 // than one incoming edge. 2228 2229 // TODO: Simplify PHINodes in the basic block to remove duplicate 2230 // predecessors. 2231 2232 // Special case for CleanupPad: all EH blocks must have the same unwind edge 2233 // so we need to create an additional "dispatcher" block. 2234 if (auto *CleanupPad = 2235 dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) { 2236 SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); 2237 for (BasicBlock *Pred : Preds) { 2238 if (CatchSwitchInst *CS = 2239 dyn_cast<CatchSwitchInst>(Pred->getTerminator())) { 2240 // CleanupPad with a CatchSwitch predecessor: therefore this is an 2241 // unwind destination that needs to be handle specially. 
2242 assert(CS->getUnwindDest() == &BB); 2243 (void)CS; 2244 rewritePHIsForCleanupPad(&BB, CleanupPad); 2245 return; 2246 } 2247 } 2248 } 2249 2250 LandingPadInst *LandingPad = nullptr; 2251 PHINode *ReplPHI = nullptr; 2252 if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) { 2253 // ehAwareSplitEdge will clone the LandingPad in all the edge blocks. 2254 // We replace the original landing pad with a PHINode that will collect the 2255 // results from all of them. 2256 ReplPHI = PHINode::Create(LandingPad->getType(), 1, ""); 2257 ReplPHI->insertBefore(LandingPad->getIterator()); 2258 ReplPHI->takeName(LandingPad); 2259 LandingPad->replaceAllUsesWith(ReplPHI); 2260 // We will erase the original landing pad at the end of this function after 2261 // ehAwareSplitEdge cloned it in the transition blocks. 2262 } 2263 2264 SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); 2265 for (BasicBlock *Pred : Preds) { 2266 auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI); 2267 IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName()); 2268 2269 // Stop the moving of values at ReplPHI, as this is either null or the PHI 2270 // that replaced the landing pad. 2271 movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI); 2272 } 2273 2274 if (LandingPad) { 2275 // Calls to the ehAwareSplitEdge function cloned the original landing pad; 2276 // it is no longer needed. 2277 LandingPad->eraseFromParent(); 2278 } 2279 } 2280 2281 static void rewritePHIs(Function &F) { 2282 SmallVector<BasicBlock *, 8> WorkList; 2283 2284 for (BasicBlock &BB : F) 2285 if (auto *PN = dyn_cast<PHINode>(&BB.front())) 2286 if (PN->getNumIncomingValues() > 1) 2287 WorkList.push_back(&BB); 2288 2289 for (BasicBlock *BB : WorkList) 2290 rewritePHIs(*BB); 2291 } 2292 2293 /// Default materializable callback. 2294 // Check for instructions that we can recreate on resume as opposed to spilling 2295 // the result into a coroutine frame. 2296 bool coro::defaultMaterializable(Instruction &V) { 2297 return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) || 2298 isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V)); 2299 } 2300 2301 // Check for structural coroutine intrinsics that should not be spilled into 2302 // the coroutine frame. 2303 static bool isCoroutineStructureIntrinsic(Instruction &I) { 2304 return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) || 2305 isa<CoroSuspendInst>(&I); 2306 } 2307 2308 // For each instruction identified as materializable across the suspend point, 2309 // and its associated DAG of other rematerializable instructions, 2310 // recreate the DAG of instructions after the suspend point.
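// Illustrative example (hypothetical values): if %sum = add i32 %a, %b is
// cheap to recompute and has a use on the resume side of a suspend, we do not
// spill %sum. Instead, the add (and, transitively, any other materializable
// instructions it depends on) is cloned right before that use and the use is
// redirected to the clone; only the remaining non-materializable operands
// (here %a and %b) may still need a slot in the frame.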
2311 static void rewriteMaterializableInstructions( 2312 const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> 2313 &AllRemats) { 2314 // This has to be done in 2 phases 2315 // Do the remats and record the required defs to be replaced in the 2316 // original use instructions 2317 // Once all the remats are complete, replace the uses in the final 2318 // instructions with the new defs 2319 typedef struct { 2320 Instruction *Use; 2321 Instruction *Def; 2322 Instruction *Remat; 2323 } ProcessNode; 2324 2325 SmallVector<ProcessNode> FinalInstructionsToProcess; 2326 2327 for (const auto &E : AllRemats) { 2328 Instruction *Use = E.first; 2329 Instruction *CurrentMaterialization = nullptr; 2330 RematGraph *RG = E.second.get(); 2331 ReversePostOrderTraversal<RematGraph *> RPOT(RG); 2332 SmallVector<Instruction *> InstructionsToProcess; 2333 2334 // If the target use is actually a suspend instruction then we have to 2335 // insert the remats into the end of the predecessor (there should only be 2336 // one). This is so that suspend blocks always have the suspend instruction 2337 // as the first instruction. 2338 auto InsertPoint = &*Use->getParent()->getFirstInsertionPt(); 2339 if (isa<AnyCoroSuspendInst>(Use)) { 2340 BasicBlock *SuspendPredecessorBlock = 2341 Use->getParent()->getSinglePredecessor(); 2342 assert(SuspendPredecessorBlock && "malformed coro suspend instruction"); 2343 InsertPoint = SuspendPredecessorBlock->getTerminator(); 2344 } 2345 2346 // Note: skip the first instruction as this is the actual use that we're 2347 // rematerializing everything for. 2348 auto I = RPOT.begin(); 2349 ++I; 2350 for (; I != RPOT.end(); ++I) { 2351 Instruction *D = (*I)->Node; 2352 CurrentMaterialization = D->clone(); 2353 CurrentMaterialization->setName(D->getName()); 2354 CurrentMaterialization->insertBefore(InsertPoint); 2355 InsertPoint = CurrentMaterialization; 2356 2357 // Replace all uses of Def in the instructions being added as part of this 2358 // rematerialization group 2359 for (auto &I : InstructionsToProcess) 2360 I->replaceUsesOfWith(D, CurrentMaterialization); 2361 2362 // Don't replace the final use at this point as this can cause problems 2363 // for other materializations. Instead, for any final use that uses a 2364 // define that's being rematerialized, record the replace values 2365 for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i) 2366 if (Use->getOperand(i) == D) // Is this operand pointing to oldval? 2367 FinalInstructionsToProcess.push_back( 2368 {Use, D, CurrentMaterialization}); 2369 2370 InstructionsToProcess.push_back(CurrentMaterialization); 2371 } 2372 } 2373 2374 // Finally, replace the uses with the defines that we've just rematerialized 2375 for (auto &R : FinalInstructionsToProcess) { 2376 if (auto *PN = dyn_cast<PHINode>(R.Use)) { 2377 assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming " 2378 "values in the PHINode"); 2379 PN->replaceAllUsesWith(R.Remat); 2380 PN->eraseFromParent(); 2381 continue; 2382 } 2383 R.Use->replaceUsesOfWith(R.Def, R.Remat); 2384 } 2385 } 2386 2387 // Splits the block at a particular instruction unless it is the first 2388 // instruction in the block with a single predecessor. 
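// For example (illustrative), the splitAround helper below uses this twice, so
// that splitAround(CSI, "CoroSuspend") leaves the coro.suspend (followed only
// by an unconditional branch) in a block named "CoroSuspend", with the
// remaining instructions moved into a new "AfterCoroSuspend" block.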
2389 static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) { 2390 auto *BB = I->getParent(); 2391 if (&BB->front() == I) { 2392 if (BB->getSinglePredecessor()) { 2393 BB->setName(Name); 2394 return BB; 2395 } 2396 } 2397 return BB->splitBasicBlock(I, Name); 2398 } 2399 2400 // Split above and below a particular instruction so that it 2401 // will be all alone by itself in a block. 2402 static void splitAround(Instruction *I, const Twine &Name) { 2403 splitBlockIfNotFirst(I, Name); 2404 splitBlockIfNotFirst(I->getNextNode(), "After" + Name); 2405 } 2406 2407 static bool isSuspendBlock(BasicBlock *BB) { 2408 return isa<AnyCoroSuspendInst>(BB->front()); 2409 } 2410 2411 typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet; 2412 2413 /// Does control flow starting at the given block ever reach a suspend 2414 /// instruction before reaching a block in VisitedOrFreeBBs? 2415 static bool isSuspendReachableFrom(BasicBlock *From, 2416 VisitedBlocksSet &VisitedOrFreeBBs) { 2417 // Eagerly try to add this block to the visited set. If it's already 2418 // there, stop recursing; this path doesn't reach a suspend before 2419 // either looping or reaching a freeing block. 2420 if (!VisitedOrFreeBBs.insert(From).second) 2421 return false; 2422 2423 // We assume that we'll already have split suspends into their own blocks. 2424 if (isSuspendBlock(From)) 2425 return true; 2426 2427 // Recurse on the successors. 2428 for (auto *Succ : successors(From)) { 2429 if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs)) 2430 return true; 2431 } 2432 2433 return false; 2434 } 2435 2436 /// Is the given alloca "local", i.e. bounded in lifetime to not cross a 2437 /// suspend point? 2438 static bool isLocalAlloca(CoroAllocaAllocInst *AI) { 2439 // Seed the visited set with all the basic blocks containing a free 2440 // so that we won't pass them up. 2441 VisitedBlocksSet VisitedOrFreeBBs; 2442 for (auto *User : AI->users()) { 2443 if (auto FI = dyn_cast<CoroAllocaFreeInst>(User)) 2444 VisitedOrFreeBBs.insert(FI->getParent()); 2445 } 2446 2447 return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs); 2448 } 2449 2450 /// After we split the coroutine, will the given basic block be along 2451 /// an obvious exit path for the resumption function? 2452 static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, 2453 unsigned depth = 3) { 2454 // If we've bottomed out our depth count, stop searching and assume 2455 // that the path might loop back. 2456 if (depth == 0) return false; 2457 2458 // If this is a suspend block, we're about to exit the resumption function. 2459 if (isSuspendBlock(BB)) return true; 2460 2461 // Recurse into the successors. 2462 for (auto *Succ : successors(BB)) { 2463 if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1)) 2464 return false; 2465 } 2466 2467 // If none of the successors leads back in a loop, we're on an exit/abort. 2468 return true; 2469 } 2470 2471 static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) { 2472 // Look for a free that isn't sufficiently obviously followed by 2473 // either a suspend or a termination, i.e. something that will leave 2474 // the coro resumption frame. 2475 for (auto *U : AI->users()) { 2476 auto FI = dyn_cast<CoroAllocaFreeInst>(U); 2477 if (!FI) continue; 2478 2479 if (!willLeaveFunctionImmediatelyAfter(FI->getParent())) 2480 return true; 2481 } 2482 2483 // If we never found one, we don't need a stack save. 
2484 return false; 2485 } 2486 2487 /// Turn each of the given local allocas into a normal (dynamic) alloca 2488 /// instruction. 2489 static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas, 2490 SmallVectorImpl<Instruction*> &DeadInsts) { 2491 for (auto *AI : LocalAllocas) { 2492 IRBuilder<> Builder(AI); 2493 2494 // Save the stack depth. Try to avoid doing this if the stackrestore 2495 // is going to immediately precede a return or something. 2496 Value *StackSave = nullptr; 2497 if (localAllocaNeedsStackSave(AI)) 2498 StackSave = Builder.CreateStackSave(); 2499 2500 // Allocate memory. 2501 auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize()); 2502 Alloca->setAlignment(AI->getAlignment()); 2503 2504 for (auto *U : AI->users()) { 2505 // Replace gets with the allocation. 2506 if (isa<CoroAllocaGetInst>(U)) { 2507 U->replaceAllUsesWith(Alloca); 2508 2509 // Replace frees with stackrestores. This is safe because 2510 // alloca.alloc is required to obey a stack discipline, although we 2511 // don't enforce that structurally. 2512 } else { 2513 auto FI = cast<CoroAllocaFreeInst>(U); 2514 if (StackSave) { 2515 Builder.SetInsertPoint(FI); 2516 Builder.CreateStackRestore(StackSave); 2517 } 2518 } 2519 DeadInsts.push_back(cast<Instruction>(U)); 2520 } 2521 2522 DeadInsts.push_back(AI); 2523 } 2524 } 2525 2526 /// Turn the given coro.alloca.alloc call into a dynamic allocation. 2527 /// This happens during the all-instructions iteration, so it must not 2528 /// delete the call. 2529 static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI, 2530 coro::Shape &Shape, 2531 SmallVectorImpl<Instruction*> &DeadInsts) { 2532 IRBuilder<> Builder(AI); 2533 auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr); 2534 2535 for (User *U : AI->users()) { 2536 if (isa<CoroAllocaGetInst>(U)) { 2537 U->replaceAllUsesWith(Alloc); 2538 } else { 2539 auto FI = cast<CoroAllocaFreeInst>(U); 2540 Builder.SetInsertPoint(FI); 2541 Shape.emitDealloc(Builder, Alloc, nullptr); 2542 } 2543 DeadInsts.push_back(cast<Instruction>(U)); 2544 } 2545 2546 // Push this on last so that it gets deleted after all the others. 2547 DeadInsts.push_back(AI); 2548 2549 // Return the new allocation value so that we can check for needed spills. 2550 return cast<Instruction>(Alloc); 2551 } 2552 2553 /// Get the current swifterror value. 2554 static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy, 2555 coro::Shape &Shape) { 2556 // Make a fake function pointer as a sort of intrinsic. 2557 auto FnTy = FunctionType::get(ValueTy, {}, false); 2558 auto Fn = ConstantPointerNull::get(Builder.getPtrTy()); 2559 2560 auto Call = Builder.CreateCall(FnTy, Fn, {}); 2561 Shape.SwiftErrorOps.push_back(Call); 2562 2563 return Call; 2564 } 2565 2566 /// Set the given value as the current swifterror value. 2567 /// 2568 /// Returns a slot that can be used as a swifterror slot. 2569 static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V, 2570 coro::Shape &Shape) { 2571 // Make a fake function pointer as a sort of intrinsic. 2572 auto FnTy = FunctionType::get(Builder.getPtrTy(), 2573 {V->getType()}, false); 2574 auto Fn = ConstantPointerNull::get(Builder.getPtrTy()); 2575 2576 auto Call = Builder.CreateCall(FnTy, Fn, { V }); 2577 Shape.SwiftErrorOps.push_back(Call); 2578 2579 return Call; 2580 } 2581 2582 /// Set the swifterror value from the given alloca before a call, 2583 /// then put in back in the alloca afterwards. 
2584 /// 2585 /// Returns an address that will stand in for the swifterror slot 2586 /// until splitting. 2587 static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call, 2588 AllocaInst *Alloca, 2589 coro::Shape &Shape) { 2590 auto ValueTy = Alloca->getAllocatedType(); 2591 IRBuilder<> Builder(Call); 2592 2593 // Load the current value from the alloca and set it as the 2594 // swifterror value. 2595 auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca); 2596 auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape); 2597 2598 // Move to after the call. Since swifterror only has a guaranteed 2599 // value on normal exits, we can ignore implicit and explicit unwind 2600 // edges. 2601 if (isa<CallInst>(Call)) { 2602 Builder.SetInsertPoint(Call->getNextNode()); 2603 } else { 2604 auto Invoke = cast<InvokeInst>(Call); 2605 Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg()); 2606 } 2607 2608 // Get the current swifterror value and store it to the alloca. 2609 auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape); 2610 Builder.CreateStore(ValueAfterCall, Alloca); 2611 2612 return Addr; 2613 } 2614 2615 /// Eliminate a formerly-swifterror alloca by inserting the get/set 2616 /// intrinsics and attempting to MemToReg the alloca away. 2617 static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca, 2618 coro::Shape &Shape) { 2619 for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) { 2620 // swifterror values can only be used in very specific ways. 2621 // We take advantage of that here. 2622 auto User = Use.getUser(); 2623 if (isa<LoadInst>(User) || isa<StoreInst>(User)) 2624 continue; 2625 2626 assert(isa<CallInst>(User) || isa<InvokeInst>(User)); 2627 auto Call = cast<Instruction>(User); 2628 2629 auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape); 2630 2631 // Use the returned slot address as the call argument. 2632 Use.set(Addr); 2633 } 2634 2635 // All the uses should be loads and stores now. 2636 assert(isAllocaPromotable(Alloca)); 2637 } 2638 2639 /// "Eliminate" a swifterror argument by reducing it to the alloca case 2640 /// and then loading and storing in the prologue and epilog. 2641 /// 2642 /// The argument keeps the swifterror flag. 2643 static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, 2644 coro::Shape &Shape, 2645 SmallVectorImpl<AllocaInst*> &AllocasToPromote) { 2646 IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); 2647 2648 auto ArgTy = cast<PointerType>(Arg.getType()); 2649 auto ValueTy = PointerType::getUnqual(F.getContext()); 2650 2651 // Reduce to the alloca case: 2652 2653 // Create an alloca and replace all uses of the arg with it. 2654 auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace()); 2655 Arg.replaceAllUsesWith(Alloca); 2656 2657 // Set an initial value in the alloca. swifterror is always null on entry. 2658 auto InitialValue = Constant::getNullValue(ValueTy); 2659 Builder.CreateStore(InitialValue, Alloca); 2660 2661 // Find all the suspends in the function and save and restore around them. 2662 for (auto *Suspend : Shape.CoroSuspends) { 2663 (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape); 2664 } 2665 2666 // Find all the coro.ends in the function and restore the error value. 
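// (Illustrative sketch of the overall rewrite: around each suspend above, the
// current value is loaded from the alloca and handed to the "set" marker, and
// the value produced by the "get" marker is stored back afterwards; at each
// coro.end below, the final value is reloaded and handed to the "set" marker
// once more, so the swifterror state is carried explicitly across the future
// split points.)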
2667 for (auto *End : Shape.CoroEnds) { 2668 Builder.SetInsertPoint(End); 2669 auto FinalValue = Builder.CreateLoad(ValueTy, Alloca); 2670 (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape); 2671 } 2672 2673 // Now we can use the alloca logic. 2674 AllocasToPromote.push_back(Alloca); 2675 eliminateSwiftErrorAlloca(F, Alloca, Shape); 2676 } 2677 2678 /// Eliminate all problematic uses of swifterror arguments and allocas 2679 /// from the function. We'll fix them up later when splitting the function. 2680 static void eliminateSwiftError(Function &F, coro::Shape &Shape) { 2681 SmallVector<AllocaInst*, 4> AllocasToPromote; 2682 2683 // Look for a swifterror argument. 2684 for (auto &Arg : F.args()) { 2685 if (!Arg.hasSwiftErrorAttr()) continue; 2686 2687 eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote); 2688 break; 2689 } 2690 2691 // Look for swifterror allocas. 2692 for (auto &Inst : F.getEntryBlock()) { 2693 auto Alloca = dyn_cast<AllocaInst>(&Inst); 2694 if (!Alloca || !Alloca->isSwiftError()) continue; 2695 2696 // Clear the swifterror flag. 2697 Alloca->setSwiftError(false); 2698 2699 AllocasToPromote.push_back(Alloca); 2700 eliminateSwiftErrorAlloca(F, Alloca, Shape); 2701 } 2702 2703 // If we have any allocas to promote, compute a dominator tree and 2704 // promote them en masse. 2705 if (!AllocasToPromote.empty()) { 2706 DominatorTree DT(F); 2707 PromoteMemToReg(AllocasToPromote, DT); 2708 } 2709 } 2710 2711 /// retcon and retcon.once conventions assume that all spill uses can be sunk 2712 /// after the coro.begin intrinsic. 2713 static void sinkSpillUsesAfterCoroBegin(Function &F, 2714 const FrameDataInfo &FrameData, 2715 CoroBeginInst *CoroBegin) { 2716 DominatorTree Dom(F); 2717 2718 SmallSetVector<Instruction *, 32> ToMove; 2719 SmallVector<Instruction *, 32> Worklist; 2720 2721 // Collect all users that precede coro.begin. 2722 for (auto *Def : FrameData.getAllDefs()) { 2723 for (User *U : Def->users()) { 2724 auto Inst = cast<Instruction>(U); 2725 if (Inst->getParent() != CoroBegin->getParent() || 2726 Dom.dominates(CoroBegin, Inst)) 2727 continue; 2728 if (ToMove.insert(Inst)) 2729 Worklist.push_back(Inst); 2730 } 2731 } 2732 // Recursively collect users before coro.begin. 2733 while (!Worklist.empty()) { 2734 auto *Def = Worklist.pop_back_val(); 2735 for (User *U : Def->users()) { 2736 auto Inst = cast<Instruction>(U); 2737 if (Dom.dominates(CoroBegin, Inst)) 2738 continue; 2739 if (ToMove.insert(Inst)) 2740 Worklist.push_back(Inst); 2741 } 2742 } 2743 2744 // Sort by dominance. 2745 SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end()); 2746 llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool { 2747 // If A dominates B, it should precede (<) B. 2748 return Dom.dominates(A, B); 2749 }); 2750 2751 Instruction *InsertPt = CoroBegin->getNextNode(); 2752 for (Instruction *Inst : InsertionList) 2753 Inst->moveBefore(InsertPt); 2754 } 2755 2756 /// For each local variable whose users are all inside one of the suspended 2757 /// regions, we sink its lifetime.start markers to the block just after that 2758 /// suspend point. Doing so minimizes the lifetime of each variable, 2759 /// hence minimizing the amount of data we end up putting on the frame. 2760 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, 2761 SuspendCrossingInfo &Checker, 2762 const DominatorTree &DT) { 2763 if (F.hasOptNone()) 2764 return; 2765 2766 // Collect all possible basic blocks which may dominate all uses of allocas.
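// These candidates are the entry block and the single successor of each
// suspend block. Illustrative sketch (hypothetical IR): if %x is only used
// after the first suspend, the marker
//   call void @llvm.lifetime.start.p0(i64 8, ptr %x)
// can be moved from the entry block into that suspend's successor block, so %x
// no longer needs to be considered live across the suspend.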
2767 SmallPtrSet<BasicBlock *, 4> DomSet; 2768 DomSet.insert(&F.getEntryBlock()); 2769 for (auto *CSI : Shape.CoroSuspends) { 2770 BasicBlock *SuspendBlock = CSI->getParent(); 2771 assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() && 2772 "should have split coro.suspend into its own block"); 2773 DomSet.insert(SuspendBlock->getSingleSuccessor()); 2774 } 2775 2776 for (Instruction &I : instructions(F)) { 2777 AllocaInst* AI = dyn_cast<AllocaInst>(&I); 2778 if (!AI) 2779 continue; 2780 2781 for (BasicBlock *DomBB : DomSet) { 2782 bool Valid = true; 2783 SmallVector<Instruction *, 1> Lifetimes; 2784 2785 auto isLifetimeStart = [](Instruction* I) { 2786 if (auto* II = dyn_cast<IntrinsicInst>(I)) 2787 return II->getIntrinsicID() == Intrinsic::lifetime_start; 2788 return false; 2789 }; 2790 2791 auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) { 2792 if (isLifetimeStart(U)) { 2793 Lifetimes.push_back(U); 2794 return true; 2795 } 2796 if (!U->hasOneUse() || U->stripPointerCasts() != AI) 2797 return false; 2798 if (isLifetimeStart(U->user_back())) { 2799 Lifetimes.push_back(U->user_back()); 2800 return true; 2801 } 2802 return false; 2803 }; 2804 2805 for (User *U : AI->users()) { 2806 Instruction *UI = cast<Instruction>(U); 2807 // For all users except lifetime.start markers, if they are all 2808 // dominated by one of the basic blocks and do not cross 2809 // suspend points as well, then there is no need to spill the 2810 // instruction. 2811 if (!DT.dominates(DomBB, UI->getParent()) || 2812 Checker.isDefinitionAcrossSuspend(DomBB, UI)) { 2813 // Skip lifetime.start, GEP and bitcast used by lifetime.start 2814 // markers. 2815 if (collectLifetimeStart(UI, AI)) 2816 continue; 2817 Valid = false; 2818 break; 2819 } 2820 } 2821 // Sink lifetime.start markers to dominate block when they are 2822 // only used outside the region. 2823 if (Valid && Lifetimes.size() != 0) { 2824 auto *NewLifetime = Lifetimes[0]->clone(); 2825 NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI); 2826 NewLifetime->insertBefore(DomBB->getTerminator()); 2827 2828 // All the outsided lifetime.start markers are no longer necessary. 2829 for (Instruction *S : Lifetimes) 2830 S->eraseFromParent(); 2831 2832 break; 2833 } 2834 } 2835 } 2836 } 2837 2838 static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape, 2839 const SuspendCrossingInfo &Checker, 2840 SmallVectorImpl<AllocaInfo> &Allocas, 2841 const DominatorTree &DT) { 2842 if (Shape.CoroSuspends.empty()) 2843 return; 2844 2845 // The PromiseAlloca will be specially handled since it needs to be in a 2846 // fixed position in the frame. 2847 if (AI == Shape.SwitchLowering.PromiseAlloca) 2848 return; 2849 2850 // The __coro_gro alloca should outlive the promise, make sure we 2851 // keep it outside the frame. 2852 if (AI->hasMetadata(LLVMContext::MD_coro_outside_frame)) 2853 return; 2854 2855 // The code that uses lifetime.start intrinsic does not work for functions 2856 // with loops without exit. Disable it on ABIs we know to generate such 2857 // code. 
2858 bool ShouldUseLifetimeStartInfo = 2859 (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && 2860 Shape.ABI != coro::ABI::RetconOnce); 2861 AllocaUseVisitor Visitor{AI->getDataLayout(), DT, Shape, Checker, 2862 ShouldUseLifetimeStartInfo}; 2863 Visitor.visitPtr(*AI); 2864 if (!Visitor.getShouldLiveOnFrame()) 2865 return; 2866 Allocas.emplace_back(AI, Visitor.getAliasesCopy(), 2867 Visitor.getMayWriteBeforeCoroBegin()); 2868 } 2869 2870 static std::optional<std::pair<Value &, DIExpression &>> 2871 salvageDebugInfoImpl(SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2872 bool OptimizeFrame, bool UseEntryValue, Function *F, 2873 Value *Storage, DIExpression *Expr, 2874 bool SkipOutermostLoad) { 2875 IRBuilder<> Builder(F->getContext()); 2876 auto InsertPt = F->getEntryBlock().getFirstInsertionPt(); 2877 while (isa<IntrinsicInst>(InsertPt)) 2878 ++InsertPt; 2879 Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt); 2880 2881 while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) { 2882 if (auto *LdInst = dyn_cast<LoadInst>(Inst)) { 2883 Storage = LdInst->getPointerOperand(); 2884 // FIXME: This is a heuristic that works around the fact that 2885 // LLVM IR debug intrinsics cannot yet distinguish between 2886 // memory and value locations: Because a dbg.declare(alloca) is 2887 // implicitly a memory location no DW_OP_deref operation for the 2888 // last direct load from an alloca is necessary. This condition 2889 // effectively drops the *last* DW_OP_deref in the expression. 2890 if (!SkipOutermostLoad) 2891 Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); 2892 } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) { 2893 Storage = StInst->getValueOperand(); 2894 } else { 2895 SmallVector<uint64_t, 16> Ops; 2896 SmallVector<Value *, 0> AdditionalValues; 2897 Value *Op = llvm::salvageDebugInfoImpl( 2898 *Inst, Expr ? Expr->getNumLocationOperands() : 0, Ops, 2899 AdditionalValues); 2900 if (!Op || !AdditionalValues.empty()) { 2901 // If salvaging failed or salvaging produced more than one location 2902 // operand, give up. 2903 break; 2904 } 2905 Storage = Op; 2906 Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false); 2907 } 2908 SkipOutermostLoad = false; 2909 } 2910 if (!Storage) 2911 return std::nullopt; 2912 2913 auto *StorageAsArg = dyn_cast<Argument>(Storage); 2914 const bool IsSwiftAsyncArg = 2915 StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync); 2916 2917 // Swift async arguments are described by an entry value of the ABI-defined 2918 // register containing the coroutine context. 2919 // Entry values in variadic expressions are not supported. 2920 if (IsSwiftAsyncArg && UseEntryValue && !Expr->isEntryValue() && 2921 Expr->isSingleLocationExpression()) 2922 Expr = DIExpression::prepend(Expr, DIExpression::EntryValue); 2923 2924 // If the coroutine frame is an Argument, store it in an alloca to improve 2925 // its availability (e.g. registers may be clobbered). 2926 // Avoid this if optimizations are enabled (they would remove the alloca) or 2927 // if the value is guaranteed to be available through other means (e.g. swift 2928 // ABI guarantees). 
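// A rough sketch of the result (hypothetical names): for an argument %frame,
// the code below emits
//   %frame.debug = alloca ptr
//   store ptr %frame, ptr %frame.debug
// and points the debug record at %frame.debug with a DW_OP_deref prepended to
// its expression, so the value stays describable even if the register holding
// the argument is clobbered later.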
2929 if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) { 2930 auto &Cached = ArgToAllocaMap[StorageAsArg]; 2931 if (!Cached) { 2932 Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr, 2933 Storage->getName() + ".debug"); 2934 Builder.CreateStore(Storage, Cached); 2935 } 2936 Storage = Cached; 2937 // FIXME: LLVM lacks nuanced semantics to differentiate between 2938 // memory and direct locations at the IR level. The backend will 2939 // turn a dbg.declare(alloca, ..., DIExpression()) into a memory 2940 // location. Thus, if there are deref and offset operations in the 2941 // expression, we need to add a DW_OP_deref at the *start* of the 2942 // expression to first load the contents of the alloca before 2943 // adjusting it with the expression. 2944 Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); 2945 } 2946 2947 return {{*Storage, *Expr}}; 2948 } 2949 2950 void coro::salvageDebugInfo( 2951 SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2952 DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool UseEntryValue) { 2953 2954 Function *F = DVI.getFunction(); 2955 // Follow the pointer arithmetic all the way to the incoming 2956 // function argument and convert into a DIExpression. 2957 bool SkipOutermostLoad = !isa<DbgValueInst>(DVI); 2958 Value *OriginalStorage = DVI.getVariableLocationOp(0); 2959 2960 auto SalvagedInfo = ::salvageDebugInfoImpl( 2961 ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, 2962 DVI.getExpression(), SkipOutermostLoad); 2963 if (!SalvagedInfo) 2964 return; 2965 2966 Value *Storage = &SalvagedInfo->first; 2967 DIExpression *Expr = &SalvagedInfo->second; 2968 2969 DVI.replaceVariableLocationOp(OriginalStorage, Storage); 2970 DVI.setExpression(Expr); 2971 // We only hoist dbg.declare today since it doesn't make sense to hoist 2972 // dbg.value since it does not have the same function wide guarantees that 2973 // dbg.declare does. 2974 if (isa<DbgDeclareInst>(DVI)) { 2975 std::optional<BasicBlock::iterator> InsertPt; 2976 if (auto *I = dyn_cast<Instruction>(Storage)) { 2977 InsertPt = I->getInsertionPointAfterDef(); 2978 // Update DILocation only if variable was not inlined. 2979 DebugLoc ILoc = I->getDebugLoc(); 2980 DebugLoc DVILoc = DVI.getDebugLoc(); 2981 if (ILoc && DVILoc && 2982 DVILoc->getScope()->getSubprogram() == 2983 ILoc->getScope()->getSubprogram()) 2984 DVI.setDebugLoc(I->getDebugLoc()); 2985 } else if (isa<Argument>(Storage)) 2986 InsertPt = F->getEntryBlock().begin(); 2987 if (InsertPt) 2988 DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt); 2989 } 2990 } 2991 2992 void coro::salvageDebugInfo( 2993 SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, 2994 DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue) { 2995 2996 Function *F = DVR.getFunction(); 2997 // Follow the pointer arithmetic all the way to the incoming 2998 // function argument and convert into a DIExpression. 
2999 bool SkipOutermostLoad = DVR.isDbgDeclare(); 3000 Value *OriginalStorage = DVR.getVariableLocationOp(0); 3001 3002 auto SalvagedInfo = ::salvageDebugInfoImpl( 3003 ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, 3004 DVR.getExpression(), SkipOutermostLoad); 3005 if (!SalvagedInfo) 3006 return; 3007 3008 Value *Storage = &SalvagedInfo->first; 3009 DIExpression *Expr = &SalvagedInfo->second; 3010 3011 DVR.replaceVariableLocationOp(OriginalStorage, Storage); 3012 DVR.setExpression(Expr); 3013 // We only hoist dbg.declare today since it doesn't make sense to hoist 3014 // dbg.value since it does not have the same function wide guarantees that 3015 // dbg.declare does. 3016 if (DVR.getType() == DbgVariableRecord::LocationType::Declare) { 3017 std::optional<BasicBlock::iterator> InsertPt; 3018 if (auto *I = dyn_cast<Instruction>(Storage)) { 3019 InsertPt = I->getInsertionPointAfterDef(); 3020 // Update DILocation only if variable was not inlined. 3021 DebugLoc ILoc = I->getDebugLoc(); 3022 DebugLoc DVRLoc = DVR.getDebugLoc(); 3023 if (ILoc && DVRLoc && 3024 DVRLoc->getScope()->getSubprogram() == 3025 ILoc->getScope()->getSubprogram()) 3026 DVR.setDebugLoc(ILoc); 3027 } else if (isa<Argument>(Storage)) 3028 InsertPt = F->getEntryBlock().begin(); 3029 if (InsertPt) { 3030 DVR.removeFromParent(); 3031 (*InsertPt)->getParent()->insertDbgRecordBefore(&DVR, *InsertPt); 3032 } 3033 } 3034 } 3035 3036 static void doRematerializations( 3037 Function &F, SuspendCrossingInfo &Checker, 3038 const std::function<bool(Instruction &)> &MaterializableCallback) { 3039 if (F.hasOptNone()) 3040 return; 3041 3042 SpillInfo Spills; 3043 3044 // See if there are materializable instructions across suspend points 3045 // We record these as the starting point to also identify materializable 3046 // defs of uses in these operations 3047 for (Instruction &I : instructions(F)) { 3048 if (!MaterializableCallback(I)) 3049 continue; 3050 for (User *U : I.users()) 3051 if (Checker.isDefinitionAcrossSuspend(I, U)) 3052 Spills[&I].push_back(cast<Instruction>(U)); 3053 } 3054 3055 // Process each of the identified rematerializable instructions 3056 // and add predecessor instructions that can also be rematerialized. 3057 // This is actually a graph of instructions since we could potentially 3058 // have multiple uses of a def in the set of predecessor instructions. 3059 // The approach here is to maintain a graph of instructions for each bottom 3060 // level instruction - where we have a unique set of instructions (nodes) 3061 // and edges between them. We then walk the graph in reverse post-dominator 3062 // order to insert them past the suspend point, but ensure that ordering is 3063 // correct. We also rely on CSE removing duplicate defs for remats of 3064 // different instructions with a def in common (rather than maintaining more 3065 // complex graphs for each suspend point) 3066 3067 // We can do this by adding new nodes to the list for each suspend 3068 // point. 
  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
  for (auto &E : Spills) {
    for (Instruction *U : E.second) {
      // Don't process a user twice (this can happen if the instruction uses
      // more than one rematerializable def).
      if (AllRemats.count(U))
        continue;

      // The constructor creates the whole RematGraph for the given use.
      auto RematUPtr =
          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);

      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
                 for (auto I = RPOT.begin(); I != RPOT.end(); ++I) {
                   (*I)->Node->dump();
                 } dbgs() << "\n";);

      AllRemats[U] = std::move(RematUPtr);
    }
  }

  // Rewrite materializable instructions to be materialized at the use
  // point.
  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
  rewriteMaterializableInstructions(AllRemats);
}

void coro::buildCoroutineFrame(
    Function &F, Shape &Shape, TargetTransformInfo &TTI,
    const std::function<bool(Instruction &)> &MaterializableCallback) {
  // Don't eliminate swifterror in async functions that won't be split.
  if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
    eliminateSwiftError(F, Shape);

  if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) {
    Shape.getSwitchCoroId()->clearPromise();
  }

  // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
  // intrinsics are in their own blocks to simplify the logic of building up
  // SuspendCrossing data.
  for (auto *CSI : Shape.CoroSuspends) {
    if (auto *Save = CSI->getCoroSave())
      splitAround(Save, "CoroSave");
    splitAround(CSI, "CoroSuspend");
  }

  // Put CoroEnds into their own blocks.
  for (AnyCoroEndInst *CE : Shape.CoroEnds) {
    splitAround(CE, "CoroEnd");

    // Emit the musttail call function in a new block before the CoroEnd.
    // We do this here so that the right suspend-crossing info is computed
    // for the uses of the musttail call. (Arguments to the coro.end
    // instructions would be ignored.)
    if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) {
      auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction();
      if (!MustTailCallFn)
        continue;
      IRBuilder<> Builder(AsyncEnd);
      SmallVector<Value *, 8> Args(AsyncEnd->args());
      auto Arguments = ArrayRef<Value *>(Args).drop_front(3);
      auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
                                      TTI, Arguments, Builder);
      splitAround(Call, "MustTailCall.Before.CoroEnd");
    }
  }

  // Later code makes structural assumptions about single-predecessor PHIs,
  // e.g. that they are not live across a suspend point.
  cleanupSinglePredPHIs(F);

  // Transform multi-edge PHI nodes so that a value feeding into a PHI never
  // has its definition separated from the PHI by a suspend point.
  rewritePHIs(F);
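  // As a rough sketch of what rewritePHIs does (schematic IR; block and value
  // names are invented for exposition), a multi-edge PHI such as
  //
  //   merge:
  //     %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
  //
  // has each incoming edge split so that every incoming value flows through
  // its own single-predecessor block:
  //
  //   merge.from.bb1:
  //     %p.bb1 = phi i32 [ %a, %bb1 ]
  //     br label %merge
  //   merge.from.bb2:
  //     %p.bb2 = phi i32 [ %b, %bb2 ]
  //     br label %merge
  //   merge:
  //     %p = phi i32 [ %p.bb1, %merge.from.bb1 ], [ %p.bb2, %merge.from.bb2 ]
  //
  // The resulting single-incoming-value PHIs can then be treated like
  // ordinary uses when deciding what must be spilled into the frame.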
  // Build suspend crossing info.
  SuspendCrossingInfo Checker(F, Shape);

  doRematerializations(F, Checker, MaterializableCallback);

  const DominatorTree DT(F);
  FrameDataInfo FrameData;
  SmallVector<CoroAllocaAllocInst *, 4> LocalAllocas;
  SmallVector<Instruction *, 4> DeadInstructions;
  if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
      Shape.ABI != coro::ABI::RetconOnce)
    sinkLifetimeStartMarkers(F, Shape, Checker, DT);

  // Collect the spills for arguments and other non-materializable values.
  for (Argument &A : F.args())
    for (User *U : A.users())
      if (Checker.isDefinitionAcrossSuspend(A, U))
        FrameData.Spills[&A].push_back(cast<Instruction>(U));

  for (Instruction &I : instructions(F)) {
    // Values returned from coroutine structure intrinsics should not be part
    // of the coroutine frame.
    if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin)
      continue;

    // Handle coro.alloca.alloc specially here.
    if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) {
      // Check whether the alloca's lifetime is bounded by suspend points.
      if (isLocalAlloca(AI)) {
        LocalAllocas.push_back(AI);
        continue;
      }

      // If not, do a quick rewrite of the alloca and then add spills of
      // the rewritten value. The rewrite doesn't invalidate anything in
      // Spills because the other alloca intrinsics have no other operands
      // besides AI, and it doesn't invalidate the iteration because we delay
      // erasing AI.
      auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions);

      for (User *U : Alloc->users()) {
        if (Checker.isDefinitionAcrossSuspend(*Alloc, U))
          FrameData.Spills[Alloc].push_back(cast<Instruction>(U));
      }
      continue;
    }

    // Ignore coro.alloca.get; we process it as part of coro.alloca.alloc.
    if (isa<CoroAllocaGetInst>(I))
      continue;

    if (auto *AI = dyn_cast<AllocaInst>(&I)) {
      collectFrameAlloca(AI, Shape, Checker, FrameData.Allocas, DT);
      continue;
    }

    for (User *U : I.users())
      if (Checker.isDefinitionAcrossSuspend(I, U)) {
        // We cannot spill a token.
        if (I.getType()->isTokenTy())
          report_fatal_error(
              "token definition is separated from the use by a suspend point");
        FrameData.Spills[&I].push_back(cast<Instruction>(U));
      }
  }

  LLVM_DEBUG(dumpAllocas(FrameData.Allocas));
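  // To make the spill criterion above concrete (schematic IR, simplified):
  // a value whose definition and use are separated by a suspend point,
  //
  //   %n = call i32 @produce()
  //   %sp = call i8 @llvm.coro.suspend(token %save, i1 false)
  //   ...
  //   call void @consume(i32 %n)     ; reached only after resumption
  //
  // gets a field in the coroutine frame: a store into that field is emitted
  // after the definition, and the use is rewritten to read the field back
  // out of the frame (this rewriting happens later, in insertSpills).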
  // We don't want the layout of the coroutine frame to be affected by debug
  // information, so we only salvage those DbgValueInsts whose values are
  // already in the frame. The dbg.values for allocas are handled specially.
  for (auto &Iter : FrameData.Spills) {
    auto *V = Iter.first;
    SmallVector<DbgValueInst *, 16> DVIs;
    SmallVector<DbgVariableRecord *, 16> DVRs;
    findDbgValues(DVIs, V, &DVRs);
    for (DbgValueInst *DVI : DVIs)
      if (Checker.isDefinitionAcrossSuspend(*V, DVI))
        FrameData.Spills[V].push_back(DVI);
    // Add the instructions which carry debug info that is in the frame.
    for (DbgVariableRecord *DVR : DVRs)
      if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
        FrameData.Spills[V].push_back(DVR->Marker->MarkedInstr);
  }

  LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
      Shape.ABI == coro::ABI::Async)
    sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin);
  Shape.FrameTy = buildFrameType(F, Shape, FrameData);
  Shape.FramePtr = Shape.CoroBegin;
  // For now, this works for C++ programs only.
  buildFrameDebugInfo(F, Shape, FrameData);
  insertSpills(FrameData, Shape);
  lowerLocalAllocas(LocalAllocas, DeadInstructions);

  for (auto *I : DeadInstructions)
    I->eraseFromParent();
}