xref: /llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp (revision dc12ccd13f98a3f3ec4af07e60f6fe1344965e17)
1 //===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // This file contains classes used to discover whether, for a particular
9 // value, there is a use-to-definition span that crosses a suspend point.
10 //
11 // Using the information discovered we form a Coroutine Frame structure to
12 // contain those values. All uses of those values are replaced with appropriate
13 // GEP + load from the coroutine frame. At the point of the definition we spill
14 // the value into the coroutine frame.
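//
// For illustration only (a hand-written sketch, not the exact IR this pass
// produces): a value that is live across a suspend point, e.g.
//
//   %x = call i32 @compute()
//   %save = call token @llvm.coro.save(ptr %hdl)
//   %susp = call i8 @llvm.coro.suspend(token %save, i1 false)
//   ...
//   call void @use(i32 %x)
//
// conceptually becomes a field of the coroutine frame: a store into a GEP of
// the frame pointer is inserted after the definition of %x, and the use after
// the suspend is rewritten to reload %x from that GEP.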
15 //===----------------------------------------------------------------------===//
16 
17 #include "CoroInternal.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/ADT/SmallString.h"
22 #include "llvm/Analysis/CFG.h"
23 #include "llvm/Analysis/PtrUseVisitor.h"
24 #include "llvm/Analysis/StackLifetime.h"
25 #include "llvm/Config/llvm-config.h"
26 #include "llvm/IR/CFG.h"
27 #include "llvm/IR/DIBuilder.h"
28 #include "llvm/IR/DebugInfo.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstIterator.h"
32 #include "llvm/IR/IntrinsicInst.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/MathExtras.h"
35 #include "llvm/Support/OptimizedStructLayout.h"
36 #include "llvm/Support/circular_raw_ostream.h"
37 #include "llvm/Support/raw_ostream.h"
38 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
39 #include "llvm/Transforms/Utils/Local.h"
40 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
41 #include <algorithm>
42 #include <deque>
43 #include <optional>
44 
45 using namespace llvm;
46 
47 extern cl::opt<bool> UseNewDbgInfoFormat;
48 
49 // The "coro-suspend-crossing" flag is very noisy. There is another debug type,
50 // "coro-frame", which results in leaner debug spew.
51 #define DEBUG_TYPE "coro-suspend-crossing"
52 
53 enum { SmallVectorThreshold = 32 };
54 
55 // Provides a two-way mapping between basic blocks and their indices.
56 namespace {
57 class BlockToIndexMapping {
58   SmallVector<BasicBlock *, SmallVectorThreshold> V;
59 
60 public:
61   size_t size() const { return V.size(); }
62 
63   BlockToIndexMapping(Function &F) {
64     for (BasicBlock &BB : F)
65       V.push_back(&BB);
66     llvm::sort(V);
67   }
68 
69   size_t blockToIndex(BasicBlock const *BB) const {
70     auto *I = llvm::lower_bound(V, BB);
71     assert(I != V.end() && *I == BB && "BlockToIndexMapping: Unknown block");
72     return I - V.begin();
73   }
74 
75   BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
76 };
77 } // end anonymous namespace
78 
79 // The SuspendCrossingInfo maintains data that allows us to answer the
80 // question whether, given two BasicBlocks A and B, there is a path from A
81 // to B that passes through a suspend point.
82 //
83 // For every basic block 'i' it maintains a BlockData that consists of:
84 //   Consumes:  a bit vector which contains a set of indices of blocks that can
85 //              reach block 'i'. A block can trivially reach itself.
86 //   Kills: a bit vector which contains a set of indices of blocks that can
87 //          reach block 'i' via a path that crosses a suspend point and does
88 //          not repeat 'i' (i.e. a path to 'i' with no cycle through 'i').
89 //   Suspend: a boolean indicating whether block 'i' contains a suspend point.
90 //   End: a boolean indicating whether block 'i' contains a coro.end intrinsic.
91 //   KillLoop: There is a path from 'i' to 'i' not otherwise repeating 'i' that
92 //             crosses a suspend point.
93 //
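//
// As a hand-worked illustration (block names are made up): for the CFG
//   entry -> susp -> use, where 'susp' contains a suspend point, the
//   analysis converges to
//     Consumes[use] = {entry, susp, use}  (every block that can reach 'use')
//     Kills[use]    = {entry, susp}       (those paths cross the suspend)
//   so hasPathCrossingSuspendPoint(entry, use) returns true, and any value
//   defined in 'entry' and used in 'use' must be spilled to the frame.
//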
94 namespace {
95 class SuspendCrossingInfo {
96   BlockToIndexMapping Mapping;
97 
98   struct BlockData {
99     BitVector Consumes;
100     BitVector Kills;
101     bool Suspend = false;
102     bool End = false;
103     bool KillLoop = false;
104     bool Changed = false;
105   };
106   SmallVector<BlockData, SmallVectorThreshold> Block;
107 
108   iterator_range<pred_iterator> predecessors(BlockData const &BD) const {
109     BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
110     return llvm::predecessors(BB);
111   }
112 
113   BlockData &getBlockData(BasicBlock *BB) {
114     return Block[Mapping.blockToIndex(BB)];
115   }
116 
117   /// Compute the BlockData for the current function in one iteration.
118   /// Initialize - Whether this is the first iteration; when it is, we can
119   /// optimize the initial case a little bit by specializing the loop.
120   /// Returns whether any BlockData changed in this iteration.
121   template <bool Initialize = false>
122   bool computeBlockData(const ReversePostOrderTraversal<Function *> &RPOT);
123 
124 public:
125 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
126   void dump() const;
127   void dump(StringRef Label, BitVector const &BV) const;
128 #endif
129 
130   SuspendCrossingInfo(Function &F, coro::Shape &Shape);
131 
132   /// Returns true if there is a path from \p From to \p To crossing a suspend
133   /// point without crossing \p From a 2nd time.
134   bool hasPathCrossingSuspendPoint(BasicBlock *From, BasicBlock *To) const {
135     size_t const FromIndex = Mapping.blockToIndex(From);
136     size_t const ToIndex = Mapping.blockToIndex(To);
137     bool const Result = Block[ToIndex].Kills[FromIndex];
138     LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
139                       << " answer is " << Result << "\n");
140     return Result;
141   }
142 
143   /// Returns true if there is a path from \p From to \p To crossing a suspend
144   /// point without crossing \p From a 2nd time. If \p From is the same as \p To
145   /// this will also check if there is a looping path crossing a suspend point.
146   bool hasPathOrLoopCrossingSuspendPoint(BasicBlock *From,
147                                          BasicBlock *To) const {
148     size_t const FromIndex = Mapping.blockToIndex(From);
149     size_t const ToIndex = Mapping.blockToIndex(To);
150     bool Result = Block[ToIndex].Kills[FromIndex] ||
151                   (From == To && Block[ToIndex].KillLoop);
152     LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
153                       << " answer is " << Result << " (path or loop)\n");
154     return Result;
155   }
156 
157   bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const {
158     auto *I = cast<Instruction>(U);
159 
160     // We rewrote PHINodes, so that only the ones with exactly one incoming
161     // value need to be analyzed.
162     if (auto *PN = dyn_cast<PHINode>(I))
163       if (PN->getNumIncomingValues() > 1)
164         return false;
165 
166     BasicBlock *UseBB = I->getParent();
167 
168     // As a special case, treat uses by an llvm.coro.suspend.retcon or an
169     // llvm.coro.suspend.async as if they were uses in the suspend's single
170     // predecessor: the uses conceptually occur before the suspend.
171     if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) {
172       UseBB = UseBB->getSinglePredecessor();
173       assert(UseBB && "should have split coro.suspend into its own block");
174     }
175 
176     return hasPathCrossingSuspendPoint(DefBB, UseBB);
177   }
178 
179   bool isDefinitionAcrossSuspend(Argument &A, User *U) const {
180     return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U);
181   }
182 
183   bool isDefinitionAcrossSuspend(Instruction &I, User *U) const {
184     auto *DefBB = I.getParent();
185 
186     // As a special case, treat values produced by an llvm.coro.suspend.*
187     // as if they were defined in the single successor: the uses
188     // conceptually occur after the suspend.
189     if (isa<AnyCoroSuspendInst>(I)) {
190       DefBB = DefBB->getSingleSuccessor();
191       assert(DefBB && "should have split coro.suspend into its own block");
192     }
193 
194     return isDefinitionAcrossSuspend(DefBB, U);
195   }
196 
197   bool isDefinitionAcrossSuspend(Value &V, User *U) const {
198     if (auto *Arg = dyn_cast<Argument>(&V))
199       return isDefinitionAcrossSuspend(*Arg, U);
200     if (auto *Inst = dyn_cast<Instruction>(&V))
201       return isDefinitionAcrossSuspend(*Inst, U);
202 
203     llvm_unreachable(
204         "Coroutine could only collect Argument and Instruction now.");
205   }
206 };
207 } // end anonymous namespace
208 
209 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
210 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
211                                                 BitVector const &BV) const {
212   dbgs() << Label << ":";
213   for (size_t I = 0, N = BV.size(); I < N; ++I)
214     if (BV[I])
215       dbgs() << " " << Mapping.indexToBlock(I)->getName();
216   dbgs() << "\n";
217 }
218 
219 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
220   for (size_t I = 0, N = Block.size(); I < N; ++I) {
221     BasicBlock *const B = Mapping.indexToBlock(I);
222     dbgs() << B->getName() << ":\n";
223     dump("   Consumes", Block[I].Consumes);
224     dump("      Kills", Block[I].Kills);
225   }
226   dbgs() << "\n";
227 }
228 #endif
229 
230 template <bool Initialize>
231 bool SuspendCrossingInfo::computeBlockData(
232     const ReversePostOrderTraversal<Function *> &RPOT) {
233   bool Changed = false;
234 
235   for (const BasicBlock *BB : RPOT) {
236     auto BBNo = Mapping.blockToIndex(BB);
237     auto &B = Block[BBNo];
238 
239     // We don't need to check the predecessors during initialization.
240     if constexpr (!Initialize)
241       // If none of the predecessors of the current block changed, then the
242       // BlockData for the current block cannot change either.
243       if (all_of(predecessors(B), [this](BasicBlock *BB) {
244             return !Block[Mapping.blockToIndex(BB)].Changed;
245           })) {
246         B.Changed = false;
247         continue;
248       }
249 
250     // Save the Consumes and Kills bitsets so that it is easy to see
251     // if anything changed after propagation.
252     auto SavedConsumes = B.Consumes;
253     auto SavedKills = B.Kills;
254 
255     for (BasicBlock *PI : predecessors(B)) {
256       auto PrevNo = Mapping.blockToIndex(PI);
257       auto &P = Block[PrevNo];
258 
259       // Propagate Kills and Consumes from predecessors into B.
260       B.Consumes |= P.Consumes;
261       B.Kills |= P.Kills;
262 
263       // If block P is a suspend block, it should propagate kills into block
264       // B for every block P consumes.
265       if (P.Suspend)
266         B.Kills |= P.Consumes;
267     }
268 
269     if (B.Suspend) {
270       // If block B is a suspend block, it should kill all of the blocks it
271       // consumes.
272       B.Kills |= B.Consumes;
273     } else if (B.End) {
274       // If block B is an end block, it should not propagate kills as the
275       // blocks following coro.end() are reached during initial invocation
276       // of the coroutine while all the data are still available on the
277       // stack or in the registers.
278       B.Kills.reset();
279     } else {
280       // This is reached when block B is neither a suspend nor a coro.end
281       // block; we need to make sure that it is not in its own kill set.
282       B.KillLoop |= B.Kills[BBNo];
283       B.Kills.reset(BBNo);
284     }
285 
286     if constexpr (!Initialize) {
287       B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
288       Changed |= B.Changed;
289     }
290   }
291 
292   return Changed;
293 }
294 
295 SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
296     : Mapping(F) {
297   const size_t N = Mapping.size();
298   Block.resize(N);
299 
300   // Initialize every block so that it consumes itself
301   for (size_t I = 0; I < N; ++I) {
302     auto &B = Block[I];
303     B.Consumes.resize(N);
304     B.Kills.resize(N);
305     B.Consumes.set(I);
306     B.Changed = true;
307   }
308 
309   // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as
310   // the code beyond coro.end is reachable during initial invocation of the
311   // coroutine.
312   for (auto *CE : Shape.CoroEnds)
313     getBlockData(CE->getParent()).End = true;
314 
315   // Mark all suspend blocks and indicate that they kill everything they
316   // consume. Note that crossing coro.save also requires a spill, as any code
317   // between coro.save and coro.suspend may resume the coroutine and all of the
318   // state needs to be saved by that time.
319   auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
320     BasicBlock *SuspendBlock = BarrierInst->getParent();
321     auto &B = getBlockData(SuspendBlock);
322     B.Suspend = true;
323     B.Kills |= B.Consumes;
324   };
325   for (auto *CSI : Shape.CoroSuspends) {
326     markSuspendBlock(CSI);
327     if (auto *Save = CSI->getCoroSave())
328       markSuspendBlock(Save);
329   }
330 
331   // It is generally faster to use an RPO traversal for a forward dataflow
332   // analysis such as this one.
333   ReversePostOrderTraversal<Function *> RPOT(&F);
334   computeBlockData</*Initialize=*/true>(RPOT);
335   while (computeBlockData</*Initialize*/ false>(RPOT))
336     ;
337 
338   LLVM_DEBUG(dump());
339 }
340 
341 namespace {
342 
343 // RematGraph is used to construct a DAG of rematerializable instructions.
344 // When the constructor is invoked with a candidate instruction (which is
345 // materializable) it builds a DAG of materializable instructions from that
346 // point.
347 // Typically, a RematGraph is created for each instruction identified as
348 // re-materializable across a suspend point.
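//
// As a hand-written illustration (the IR is made up): given
//   %a = add i32 %x, 1
//   %b = mul i32 %a, %a
// where %b is needed after a suspend point, the graph rooted at %b has a node
// for %b whose Operands edge points at the node for %a, so both instructions
// can be rematerialized after the resume instead of being spilled to the
// frame.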
349 struct RematGraph {
350   // Each RematNode in the graph contains edges to the instructions that
351   // provide the operands of the current node.
352   struct RematNode {
353     Instruction *Node;
354     SmallVector<RematNode *> Operands;
355     RematNode() = default;
356     RematNode(Instruction *V) : Node(V) {}
357   };
358 
359   RematNode *EntryNode;
360   using RematNodeMap =
361       SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
362   RematNodeMap Remats;
363   const std::function<bool(Instruction &)> &MaterializableCallback;
364   SuspendCrossingInfo &Checker;
365 
366   RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
367              Instruction *I, SuspendCrossingInfo &Checker)
368       : MaterializableCallback(MaterializableCallback), Checker(Checker) {
369     std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
370     EntryNode = FirstNode.get();
371     std::deque<std::unique_ptr<RematNode>> WorkList;
372     addNode(std::move(FirstNode), WorkList, cast<User>(I));
373     while (WorkList.size()) {
374       std::unique_ptr<RematNode> N = std::move(WorkList.front());
375       WorkList.pop_front();
376       addNode(std::move(N), WorkList, cast<User>(I));
377     }
378   }
379 
380   void addNode(std::unique_ptr<RematNode> NUPtr,
381                std::deque<std::unique_ptr<RematNode>> &WorkList,
382                User *FirstUse) {
383     RematNode *N = NUPtr.get();
384     if (Remats.count(N->Node))
385       return;
386 
387     // We haven't seen this node yet - add it to the list
388     Remats[N->Node] = std::move(NUPtr);
389     for (auto &Def : N->Node->operands()) {
390       Instruction *D = dyn_cast<Instruction>(Def.get());
391       if (!D || !MaterializableCallback(*D) ||
392           !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
393         continue;
394 
395       if (Remats.count(D)) {
396         // Already have this in the graph
397         N->Operands.push_back(Remats[D].get());
398         continue;
399       }
400 
401       bool NoMatch = true;
402       for (auto &I : WorkList) {
403         if (I->Node == D) {
404           NoMatch = false;
405           N->Operands.push_back(I.get());
406           break;
407         }
408       }
409       if (NoMatch) {
410         // Create a new node
411         std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
412         N->Operands.push_back(ChildNode.get());
413         WorkList.push_back(std::move(ChildNode));
414       }
415     }
416   }
417 
418 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
419   void dump() const {
420     dbgs() << "Entry (";
421     if (EntryNode->Node->getParent()->hasName())
422       dbgs() << EntryNode->Node->getParent()->getName();
423     else
424       EntryNode->Node->getParent()->printAsOperand(dbgs(), false);
425     dbgs() << ") : " << *EntryNode->Node << "\n";
426     for (auto &E : Remats) {
427       dbgs() << *(E.first) << "\n";
428       for (RematNode *U : E.second->Operands)
429         dbgs() << "  " << *U->Node << "\n";
430     }
431   }
432 #endif
433 };
434 } // end anonymous namespace
435 
436 namespace llvm {
437 
438 template <> struct GraphTraits<RematGraph *> {
439   using NodeRef = RematGraph::RematNode *;
440   using ChildIteratorType = RematGraph::RematNode **;
441 
442   static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
443   static ChildIteratorType child_begin(NodeRef N) {
444     return N->Operands.begin();
445   }
446   static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
447 };
448 
449 } // end namespace llvm
450 
451 #undef DEBUG_TYPE // "coro-suspend-crossing"
452 #define DEBUG_TYPE "coro-frame"
453 
454 namespace {
455 class FrameTypeBuilder;
456 // Mapping from each to-be-spilled value to all the users that need a reload.
457 using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
458 struct AllocaInfo {
459   AllocaInst *Alloca;
460   DenseMap<Instruction *, std::optional<APInt>> Aliases;
461   bool MayWriteBeforeCoroBegin;
462   AllocaInfo(AllocaInst *Alloca,
463              DenseMap<Instruction *, std::optional<APInt>> Aliases,
464              bool MayWriteBeforeCoroBegin)
465       : Alloca(Alloca), Aliases(std::move(Aliases)),
466         MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
467 };
468 struct FrameDataInfo {
469   // All the values (that are not allocas) that need to be spilled to the
470   // frame.
471   SpillInfo Spills;
472   // Allocas contains all values defined as allocas that need to live in the
473   // frame.
474   SmallVector<AllocaInfo, 8> Allocas;
475 
476   SmallVector<Value *, 8> getAllDefs() const {
477     SmallVector<Value *, 8> Defs;
478     for (const auto &P : Spills)
479       Defs.push_back(P.first);
480     for (const auto &A : Allocas)
481       Defs.push_back(A.Alloca);
482     return Defs;
483   }
484 
485   uint32_t getFieldIndex(Value *V) const {
486     auto Itr = FieldIndexMap.find(V);
487     assert(Itr != FieldIndexMap.end() &&
488            "Value does not have a frame field index");
489     return Itr->second;
490   }
491 
492   void setFieldIndex(Value *V, uint32_t Index) {
493     assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) &&
494            "Cannot set the index for the same field twice.");
495     FieldIndexMap[V] = Index;
496   }
497 
498   Align getAlign(Value *V) const {
499     auto Iter = FieldAlignMap.find(V);
500     assert(Iter != FieldAlignMap.end());
501     return Iter->second;
502   }
503 
504   void setAlign(Value *V, Align AL) {
505     assert(FieldAlignMap.count(V) == 0);
506     FieldAlignMap.insert({V, AL});
507   }
508 
509   uint64_t getDynamicAlign(Value *V) const {
510     auto Iter = FieldDynamicAlignMap.find(V);
511     assert(Iter != FieldDynamicAlignMap.end());
512     return Iter->second;
513   }
514 
515   void setDynamicAlign(Value *V, uint64_t Align) {
516     assert(FieldDynamicAlignMap.count(V) == 0);
517     FieldDynamicAlignMap.insert({V, Align});
518   }
519 
520   uint64_t getOffset(Value *V) const {
521     auto Iter = FieldOffsetMap.find(V);
522     assert(Iter != FieldOffsetMap.end());
523     return Iter->second;
524   }
525 
526   void setOffset(Value *V, uint64_t Offset) {
527     assert(FieldOffsetMap.count(V) == 0);
528     FieldOffsetMap.insert({V, Offset});
529   }
530 
531   // Remap the index of every field in the frame, using the final layout index.
532   void updateLayoutIndex(FrameTypeBuilder &B);
533 
534 private:
535   // LayoutIndexUpdateStarted is used to avoid updating the index of any field
536   // twice by mistake.
537   bool LayoutIndexUpdateStarted = false;
538   // Map from values to their slot indexes on the frame. They will be first set
539   // with their original insertion field index. After the frame is built, their
540   // indexes will be updated into the final layout index.
541   DenseMap<Value *, uint32_t> FieldIndexMap;
542   // Map from values to their alignment on the frame. They are set after
543   // the frame is built.
544   DenseMap<Value *, Align> FieldAlignMap;
545   DenseMap<Value *, uint64_t> FieldDynamicAlignMap;
546   // Map from values to their offset on the frame. They are set after
547   // the frame is built.
548   DenseMap<Value *, uint64_t> FieldOffsetMap;
549 };
550 } // namespace
551 
552 #ifndef NDEBUG
553 static void dumpSpills(StringRef Title, const SpillInfo &Spills) {
554   dbgs() << "------------- " << Title << "--------------\n";
555   for (const auto &E : Spills) {
556     E.first->dump();
557     dbgs() << "   user: ";
558     for (auto *I : E.second)
559       I->dump();
560   }
561 }
562 static void dumpRemats(
563     StringRef Title,
564     const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
565   dbgs() << "------------- " << Title << "--------------\n";
566   for (const auto &E : RM) {
567     E.second->dump();
568     dbgs() << "--\n";
569   }
570 }
571 
572 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
573   dbgs() << "------------- Allocas --------------\n";
574   for (const auto &A : Allocas) {
575     A.Alloca->dump();
576   }
577 }
578 #endif
579 
580 namespace {
581 using FieldIDType = size_t;
582 // We cannot rely solely on the natural alignment of a type when building a
583 // coroutine frame: if the alignment specified on the Alloca instruction
584 // differs from the natural alignment of the allocated type, we will need to
585 // insert padding.
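// For example (a purely illustrative case): an `alloca i32, align 8` requests
// 8-byte alignment even though i32 naturally needs only 4 bytes, so the
// layout may have to place a few bytes of padding in front of that field.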
586 class FrameTypeBuilder {
587 private:
588   struct Field {
589     uint64_t Size;
590     uint64_t Offset;
591     Type *Ty;
592     FieldIDType LayoutFieldIndex;
593     Align Alignment;
594     Align TyAlignment;
595     uint64_t DynamicAlignBuffer;
596   };
597 
598   const DataLayout &DL;
599   LLVMContext &Context;
600   uint64_t StructSize = 0;
601   Align StructAlign;
602   bool IsFinished = false;
603 
604   std::optional<Align> MaxFrameAlignment;
605 
606   SmallVector<Field, 8> Fields;
607   DenseMap<Value*, unsigned> FieldIndexByKey;
608 
609 public:
610   FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL,
611                    std::optional<Align> MaxFrameAlignment)
612       : DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {}
613 
614   /// Add a field to this structure for the storage of an `alloca`
615   /// instruction.
616   [[nodiscard]] FieldIDType addFieldForAlloca(AllocaInst *AI,
617                                               bool IsHeader = false) {
618     Type *Ty = AI->getAllocatedType();
619 
620     // Make an array type if this is a static array allocation.
621     if (AI->isArrayAllocation()) {
622       if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
623         Ty = ArrayType::get(Ty, CI->getValue().getZExtValue());
624       else
625         report_fatal_error("Coroutines cannot handle non static allocas yet");
626     }
627 
628     return addField(Ty, AI->getAlign(), IsHeader);
629   }
630 
631   /// We want to put allocas whose lifetime ranges do not overlap
632   /// into one slot of the coroutine frame.
633   /// Consider the example at: https://bugs.llvm.org/show_bug.cgi?id=45566
634   ///
635   ///     cppcoro::task<void> alternative_paths(bool cond) {
636   ///         if (cond) {
637   ///             big_structure a;
638   ///             process(a);
639   ///             co_await something();
640   ///         } else {
641   ///             big_structure b;
642   ///             process2(b);
643   ///             co_await something();
644   ///         }
645   ///     }
646   ///
647   /// We want to put variable a and variable b in the same slot to
648   /// reduce the size of the coroutine frame.
649   ///
650   /// This function uses the StackLifetime algorithm to partition the
651   /// AllocaInsts in Spills into non-overlapping sets so that allocas in the
652   /// same non-overlapping set can share a slot in the coroutine frame. It
653   /// then adds one field per set, using the largest type in the set as the
654   /// field type.
655   ///
656   /// Side effects: because we sort the allocas, their order in the frame may
657   /// differ from their order in the source code.
658   void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData,
659                           coro::Shape &Shape);
660 
661   /// Add a field to this structure.
662   [[nodiscard]] FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment,
663                                      bool IsHeader = false,
664                                      bool IsSpillOfValue = false) {
665     assert(!IsFinished && "adding fields to a finished builder");
666     assert(Ty && "must provide a type for a field");
667 
668     // The field size is always the alloc size of the type.
669     uint64_t FieldSize = DL.getTypeAllocSize(Ty);
670 
671     // For an alloca with size=0, we don't need to add a field; it can
672     // just point to any index in the frame. Use index 0.
673     if (FieldSize == 0) {
674       return 0;
675     }
676 
677     // The field alignment might not be the type alignment, but we need
678     // to remember the type alignment anyway to build the type.
679     // If we are spilling values we don't need to worry about ABI alignment
680     // concerns.
681     Align ABIAlign = DL.getABITypeAlign(Ty);
682     Align TyAlignment = ABIAlign;
683     if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign)
684       TyAlignment = *MaxFrameAlignment;
685     Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment);
686 
687     // The field alignment could be bigger than the max frame alignment, in
688     // which case we request additional storage so we can dynamically align
689     // the pointer.
690     uint64_t DynamicAlignBuffer = 0;
691     if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) {
692       DynamicAlignBuffer =
693           offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment);
694       FieldAlignment = *MaxFrameAlignment;
695       FieldSize = FieldSize + DynamicAlignBuffer;
696     }
697 
698     // Lay out header fields immediately.
699     uint64_t Offset;
700     if (IsHeader) {
701       Offset = alignTo(StructSize, FieldAlignment);
702       StructSize = Offset + FieldSize;
703 
704       // Everything else has a flexible offset.
705     } else {
706       Offset = OptimizedStructLayoutField::FlexibleOffset;
707     }
708 
709     Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment,
710                       DynamicAlignBuffer});
711     return Fields.size() - 1;
712   }
713 
714   /// Finish the layout and set the body on the given type.
715   void finish(StructType *Ty);
716 
717   uint64_t getStructSize() const {
718     assert(IsFinished && "not yet finished!");
719     return StructSize;
720   }
721 
722   Align getStructAlign() const {
723     assert(IsFinished && "not yet finished!");
724     return StructAlign;
725   }
726 
727   FieldIDType getLayoutFieldIndex(FieldIDType Id) const {
728     assert(IsFinished && "not yet finished!");
729     return Fields[Id].LayoutFieldIndex;
730   }
731 
732   Field getLayoutField(FieldIDType Id) const {
733     assert(IsFinished && "not yet finished!");
734     return Fields[Id];
735   }
736 };
737 } // namespace
738 
739 void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) {
740   auto Updater = [&](Value *I) {
741     auto Field = B.getLayoutField(getFieldIndex(I));
742     setFieldIndex(I, Field.LayoutFieldIndex);
743     setAlign(I, Field.Alignment);
744     uint64_t dynamicAlign =
745         Field.DynamicAlignBuffer
746             ? Field.DynamicAlignBuffer + Field.Alignment.value()
747             : 0;
748     setDynamicAlign(I, dynamicAlign);
749     setOffset(I, Field.Offset);
750   };
751   LayoutIndexUpdateStarted = true;
752   for (auto &S : Spills)
753     Updater(S.first);
754   for (const auto &A : Allocas)
755     Updater(A.Alloca);
756   LayoutIndexUpdateStarted = false;
757 }
758 
759 void FrameTypeBuilder::addFieldForAllocas(const Function &F,
760                                           FrameDataInfo &FrameData,
761                                           coro::Shape &Shape) {
762   using AllocaSetType = SmallVector<AllocaInst *, 4>;
763   SmallVector<AllocaSetType, 4> NonOverlapedAllocas;
764 
765   // We need to add fields for the allocas at the end of this function.
766   auto AddFieldForAllocasAtExit = make_scope_exit([&]() {
767     for (auto AllocaList : NonOverlapedAllocas) {
768       auto *LargestAI = *AllocaList.begin();
769       FieldIDType Id = addFieldForAlloca(LargestAI);
770       for (auto *Alloca : AllocaList)
771         FrameData.setFieldIndex(Alloca, Id);
772     }
773   });
774 
775   if (!Shape.OptimizeFrame) {
776     for (const auto &A : FrameData.Allocas) {
777       AllocaInst *Alloca = A.Alloca;
778       NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
779     }
780     return;
781   }
782 
783   // Because there are paths from the lifetime.start to coro.end
784   // for each alloca, the live ranges of all allocas overlap
785   // in the blocks that contain coro.end and in their successor blocks.
786   // So we choose to skip these blocks when we calculate the live range
787   // for each alloca. This should be reasonable since there shouldn't be uses
788   // in these blocks and the coroutine frame shouldn't be used outside the
789   // coroutine body.
790   //
791   // Note that the user of coro.suspend may not be a SwitchInst. However, this
792   // case seems too complex to handle. And it is harmless to skip these
793   // patterns since it merely prevents putting the allocas into the same
794   // slot.
795   DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest;
796   for (auto *CoroSuspendInst : Shape.CoroSuspends) {
797     for (auto *U : CoroSuspendInst->users()) {
798       if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) {
799         auto *SWI = const_cast<SwitchInst *>(ConstSWI);
800         DefaultSuspendDest[SWI] = SWI->getDefaultDest();
801         SWI->setDefaultDest(SWI->getSuccessor(1));
802       }
803     }
804   }
805 
806   auto ExtractAllocas = [&]() {
807     AllocaSetType Allocas;
808     Allocas.reserve(FrameData.Allocas.size());
809     for (const auto &A : FrameData.Allocas)
810       Allocas.push_back(A.Alloca);
811     return Allocas;
812   };
813   StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(),
814                                       StackLifetime::LivenessType::May);
815   StackLifetimeAnalyzer.run();
816   auto IsAllocaInterfere = [&](const AllocaInst *AI1, const AllocaInst *AI2) {
817     return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps(
818         StackLifetimeAnalyzer.getLiveRange(AI2));
819   };
820   auto GetAllocaSize = [&](const AllocaInfo &A) {
821     std::optional<TypeSize> RetSize = A.Alloca->getAllocationSize(DL);
822     assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n");
823     assert(!RetSize->isScalable() && "Scalable vectors are not yet supported");
824     return RetSize->getFixedValue();
825   };
826   // Put larger allocas at the front, so the larger allocas have higher
827   // priority to merge, which can potentially save more space. This also keeps
828   // each AllocaSet ordered, so we can easily get the largest Alloca in an
829   // AllocaSet.
830   sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) {
831     return GetAllocaSize(Iter1) > GetAllocaSize(Iter2);
832   });
833   for (const auto &A : FrameData.Allocas) {
834     AllocaInst *Alloca = A.Alloca;
835     bool Merged = false;
836     // Try to find an existing NonOverlappedAllocaSet with which this alloca
837     // does not interfere. If one is found, insert the alloca into that
838     // NonOverlappedAllocaSet.
839     for (auto &AllocaSet : NonOverlapedAllocas) {
840       assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n");
841       bool NoInterference = none_of(AllocaSet, [&](auto Iter) {
842         return IsAllocaInterfere(Alloca, Iter);
843       });
844       // If the alignment of A is a multiple of the alignment of B, the
845       // address of A satisfies the alignment requirement of B.
846       //
847       // There may be more fine-grained strategies to handle the alignment
848       // information during the merging process, but they seem hard to
849       // implement while providing little benefit.
850       bool Alignable = [&]() -> bool {
851         auto *LargestAlloca = *AllocaSet.begin();
852         return LargestAlloca->getAlign().value() % Alloca->getAlign().value() ==
853                0;
854       }();
855       bool CouldMerge = NoInference && Alignable;
856       if (!CouldMerge)
857         continue;
858       AllocaSet.push_back(Alloca);
859       Merged = true;
860       break;
861     }
862     if (!Merged) {
863       NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
864     }
865   }
866   // Restore the default target destination for each switch statement
867   // saved above.
868   for (auto SwitchAndDefaultDest : DefaultSuspendDest) {
869     SwitchInst *SWI = SwitchAndDefaultDest.first;
870     BasicBlock *DestBB = SwitchAndDefaultDest.second;
871     SWI->setDefaultDest(DestBB);
872   }
873   // This debug output tells us which allocas were merged into one slot.
874   LLVM_DEBUG(for (auto &AllocaSet
875                   : NonOverlapedAllocas) {
876     if (AllocaSet.size() > 1) {
877       dbgs() << "In Function:" << F.getName() << "\n";
878       dbgs() << "Find Union Set "
879              << "\n";
880       dbgs() << "\tAllocas are \n";
881       for (auto Alloca : AllocaSet)
882         dbgs() << "\t\t" << *Alloca << "\n";
883     }
884   });
885 }
886 
887 void FrameTypeBuilder::finish(StructType *Ty) {
888   assert(!IsFinished && "already finished!");
889 
890   // Prepare the optimal-layout field array.
891   // The Id in the layout field is a pointer to our Field for it.
892   SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
893   LayoutFields.reserve(Fields.size());
894   for (auto &Field : Fields) {
895     LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment,
896                               Field.Offset);
897   }
898 
899   // Perform layout.
900   auto SizeAndAlign = performOptimizedStructLayout(LayoutFields);
901   StructSize = SizeAndAlign.first;
902   StructAlign = SizeAndAlign.second;
903 
904   auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & {
905     return *static_cast<Field *>(const_cast<void*>(LayoutField.Id));
906   };
907 
908   // We need to produce a packed struct type if there's a field whose
909   // assigned offset isn't a multiple of its natural type alignment.
910   bool Packed = [&] {
911     for (auto &LayoutField : LayoutFields) {
912       auto &F = getField(LayoutField);
913       if (!isAligned(F.TyAlignment, LayoutField.Offset))
914         return true;
915     }
916     return false;
917   }();
918 
919   // Build the struct body.
920   SmallVector<Type*, 16> FieldTypes;
921   FieldTypes.reserve(LayoutFields.size() * 3 / 2);
922   uint64_t LastOffset = 0;
923   for (auto &LayoutField : LayoutFields) {
924     auto &F = getField(LayoutField);
925 
926     auto Offset = LayoutField.Offset;
927 
928     // Add a padding field if there's a padding gap and we're either
929     // building a packed struct or the padding gap is more than we'd
930     // get from aligning to the field type's natural alignment.
931     assert(Offset >= LastOffset);
932     if (Offset != LastOffset) {
933       if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset)
934         FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context),
935                                             Offset - LastOffset));
936     }
937 
938     F.Offset = Offset;
939     F.LayoutFieldIndex = FieldTypes.size();
940 
941     FieldTypes.push_back(F.Ty);
942     if (F.DynamicAlignBuffer) {
943       FieldTypes.push_back(
944           ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer));
945     }
946     LastOffset = Offset + F.Size;
947   }
948 
949   Ty->setBody(FieldTypes, Packed);
950 
951 #ifndef NDEBUG
952   // Check that the IR layout matches the offsets we expect.
953   auto Layout = DL.getStructLayout(Ty);
954   for (auto &F : Fields) {
955     assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty);
956     assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset);
957   }
958 #endif
959 
960   IsFinished = true;
961 }
962 
963 static void cacheDIVar(FrameDataInfo &FrameData,
964                        DenseMap<Value *, DILocalVariable *> &DIVarCache) {
965   for (auto *V : FrameData.getAllDefs()) {
966     if (DIVarCache.contains(V))
967       continue;
968 
969     auto CacheIt = [&DIVarCache, V](const auto &Container) {
970       auto *I = llvm::find_if(Container, [](auto *DDI) {
971         return DDI->getExpression()->getNumElements() == 0;
972       });
973       if (I != Container.end())
974         DIVarCache.insert({V, (*I)->getVariable()});
975     };
976     CacheIt(findDbgDeclares(V));
977     CacheIt(findDVRDeclares(V));
978   }
979 }
980 
981 /// Create a name for the Type. It uses MDString to store the newly created
982 /// string to avoid a memory leak.
983 static StringRef solveTypeName(Type *Ty) {
984   if (Ty->isIntegerTy()) {
985     // The longest common name is likely '__int_128', which has 9 characters.
986     SmallString<16> Buffer;
987     raw_svector_ostream OS(Buffer);
988     OS << "__int_" << cast<IntegerType>(Ty)->getBitWidth();
989     auto *MDName = MDString::get(Ty->getContext(), OS.str());
990     return MDName->getString();
991   }
992 
993   if (Ty->isFloatingPointTy()) {
994     if (Ty->isFloatTy())
995       return "__float_";
996     if (Ty->isDoubleTy())
997       return "__double_";
998     return "__floating_type_";
999   }
1000 
1001   if (Ty->isPointerTy())
1002     return "PointerType";
1003 
1004   if (Ty->isStructTy()) {
1005     if (!cast<StructType>(Ty)->hasName())
1006       return "__LiteralStructType_";
1007 
1008     auto Name = Ty->getStructName();
1009 
1010     SmallString<16> Buffer(Name);
1011     for (auto &Iter : Buffer)
1012       if (Iter == '.' || Iter == ':')
1013         Iter = '_';
1014     auto *MDName = MDString::get(Ty->getContext(), Buffer.str());
1015     return MDName->getString();
1016   }
1017 
1018   return "UnknownType";
1019 }
1020 
1021 static DIType *solveDIType(DIBuilder &Builder, Type *Ty,
1022                            const DataLayout &Layout, DIScope *Scope,
1023                            unsigned LineNum,
1024                            DenseMap<Type *, DIType *> &DITypeCache) {
1025   if (DIType *DT = DITypeCache.lookup(Ty))
1026     return DT;
1027 
1028   StringRef Name = solveTypeName(Ty);
1029 
1030   DIType *RetType = nullptr;
1031 
1032   if (Ty->isIntegerTy()) {
1033     auto BitWidth = cast<IntegerType>(Ty)->getBitWidth();
1034     RetType = Builder.createBasicType(Name, BitWidth, dwarf::DW_ATE_signed,
1035                                       llvm::DINode::FlagArtificial);
1036   } else if (Ty->isFloatingPointTy()) {
1037     RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty),
1038                                       dwarf::DW_ATE_float,
1039                                       llvm::DINode::FlagArtificial);
1040   } else if (Ty->isPointerTy()) {
1041     // Construct a PointerType that points to null (i.e. void *) instead of
1042     // exploring the pointee type, to avoid an infinite recursion. For example,
1043     // we would be in trouble if we traversed recursively:
1044     //
1045     //  struct Node {
1046     //      Node* ptr;
1047     //  };
1048     RetType =
1049         Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty),
1050                                   Layout.getABITypeAlign(Ty).value() * CHAR_BIT,
1051                                   /*DWARFAddressSpace=*/std::nullopt, Name);
1052   } else if (Ty->isStructTy()) {
1053     auto *DIStruct = Builder.createStructType(
1054         Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty),
1055         Layout.getPrefTypeAlign(Ty).value() * CHAR_BIT,
1056         llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray());
1057 
1058     auto *StructTy = cast<StructType>(Ty);
1059     SmallVector<Metadata *, 16> Elements;
1060     for (unsigned I = 0; I < StructTy->getNumElements(); I++) {
1061       DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout,
1062                                  Scope, LineNum, DITypeCache);
1063       assert(DITy);
1064       Elements.push_back(Builder.createMemberType(
1065           Scope, DITy->getName(), Scope->getFile(), LineNum,
1066           DITy->getSizeInBits(), DITy->getAlignInBits(),
1067           Layout.getStructLayout(StructTy)->getElementOffsetInBits(I),
1068           llvm::DINode::FlagArtificial, DITy));
1069     }
1070 
1071     Builder.replaceArrays(DIStruct, Builder.getOrCreateArray(Elements));
1072 
1073     RetType = DIStruct;
1074   } else {
1075     LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n");
1076     TypeSize Size = Layout.getTypeSizeInBits(Ty);
1077     auto *CharSizeType = Builder.createBasicType(
1078         Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial);
1079 
1080     if (Size <= 8)
1081       RetType = CharSizeType;
1082     else {
1083       if (Size % 8 != 0)
1084         Size = TypeSize::getFixed(Size + 8 - (Size % 8));
1085 
1086       RetType = Builder.createArrayType(
1087           Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType,
1088           Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8)));
1089     }
1090   }
1091 
1092   DITypeCache.insert({Ty, RetType});
1093   return RetType;
1094 }
1095 
1096 /// Build artificial debug info for C++ coroutine frames to allow users to
1097 /// inspect the contents of the frame directly.
1098 ///
1099 /// Create debug information for the coroutine frame with the debug name
1100 /// "__coro_frame". The debug information for the fields of the coroutine
1101 /// frame is constructed in the following way:
1102 /// 1. For every value in the frame, we search the dbg.declare uses to find
1103 ///    the corresponding debug variable for the value. If we can find the
1104 ///    debug variable, we get full and accurate debug information.
1105 /// 2. If we can't get debug information in step 1, we fall back to building
1106 ///    a DIType from the IR type. We do this in solveDIType. We only handle
1107 ///    integer, floating-point, pointer and struct types for now.
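///
/// A hypothetical debugging session (the exact output depends on the debugger
/// and on how much information survives optimization) could then look like:
///   (gdb) print __coro_frame
///   $1 = {__resume_fn = 0x..., __destroy_fn = 0x..., __coro_index = 1, ...}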
1108 static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
1109                                 FrameDataInfo &FrameData) {
1110   DISubprogram *DIS = F.getSubprogram();
1111   // If there is no DISubprogram for F, it implies the function is not
1112   // compiled with debug info, so we don't need to generate debug info for
1113   // the frame either.
1114   if (!DIS || !DIS->getUnit() ||
1115       !dwarf::isCPlusPlus(
1116           (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()))
1117     return;
1118 
1119   assert(Shape.ABI == coro::ABI::Switch &&
1120          "We could only build debug infomation for C++ coroutine now.\n");
1121 
1122   DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);
1123 
1124   AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
1125   assert(PromiseAlloca &&
1126          "Coroutine with switch ABI should own Promise alloca");
1127 
1128   DIFile *DFile = DIS->getFile();
1129   unsigned LineNum = DIS->getLine();
1130 
1131   DICompositeType *FrameDITy = DBuilder.createStructType(
1132       DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(),
1133       DFile, LineNum, Shape.FrameSize * 8,
1134       Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr,
1135       llvm::DINodeArray());
1136   StructType *FrameTy = Shape.FrameTy;
1137   SmallVector<Metadata *, 16> Elements;
1138   DataLayout Layout = F.getDataLayout();
1139 
1140   DenseMap<Value *, DILocalVariable *> DIVarCache;
1141   cacheDIVar(FrameData, DIVarCache);
1142 
1143   unsigned ResumeIndex = coro::Shape::SwitchFieldIndex::Resume;
1144   unsigned DestroyIndex = coro::Shape::SwitchFieldIndex::Destroy;
1145   unsigned IndexIndex = Shape.SwitchLowering.IndexField;
1146 
1147   DenseMap<unsigned, StringRef> NameCache;
1148   NameCache.insert({ResumeIndex, "__resume_fn"});
1149   NameCache.insert({DestroyIndex, "__destroy_fn"});
1150   NameCache.insert({IndexIndex, "__coro_index"});
1151 
1152   Type *ResumeFnTy = FrameTy->getElementType(ResumeIndex),
1153        *DestroyFnTy = FrameTy->getElementType(DestroyIndex),
1154        *IndexTy = FrameTy->getElementType(IndexIndex);
1155 
1156   DenseMap<unsigned, DIType *> TyCache;
1157   TyCache.insert(
1158       {ResumeIndex, DBuilder.createPointerType(
1159                         nullptr, Layout.getTypeSizeInBits(ResumeFnTy))});
1160   TyCache.insert(
1161       {DestroyIndex, DBuilder.createPointerType(
1162                          nullptr, Layout.getTypeSizeInBits(DestroyFnTy))});
1163 
1164   /// FIXME: If we fill the field `SizeInBits` with the actual size of
1165   /// __coro_index in bits, then __coro_index wouldn't show in the debugger.
1166   TyCache.insert({IndexIndex, DBuilder.createBasicType(
1167                                   "__coro_index",
1168                                   (Layout.getTypeSizeInBits(IndexTy) < 8)
1169                                       ? 8
1170                                       : Layout.getTypeSizeInBits(IndexTy),
1171                                   dwarf::DW_ATE_unsigned_char)});
1172 
1173   for (auto *V : FrameData.getAllDefs()) {
1174     if (!DIVarCache.contains(V))
1175       continue;
1176 
1177     auto Index = FrameData.getFieldIndex(V);
1178 
1179     NameCache.insert({Index, DIVarCache[V]->getName()});
1180     TyCache.insert({Index, DIVarCache[V]->getType()});
1181   }
1182 
1183   // Cache from field index to an (Align, Offset) pair.
1184   DenseMap<unsigned, std::pair<unsigned, unsigned>> OffsetCache;
1185   // The Align and Offset of Resume function and Destroy function are fixed.
1186   OffsetCache.insert({ResumeIndex, {8, 0}});
1187   OffsetCache.insert({DestroyIndex, {8, 8}});
1188   OffsetCache.insert(
1189       {IndexIndex,
1190        {Shape.SwitchLowering.IndexAlign, Shape.SwitchLowering.IndexOffset}});
1191 
1192   for (auto *V : FrameData.getAllDefs()) {
1193     auto Index = FrameData.getFieldIndex(V);
1194 
1195     OffsetCache.insert(
1196         {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}});
1197   }
1198 
1199   DenseMap<Type *, DIType *> DITypeCache;
1200   // This counter is used to avoid duplicate type names, e.g., there may be
1201   // many i32 and i64 types in one coroutine, and we use names like i32_0 and
1202   // i32_1 to distinguish them, since it makes no sense for field names to
1203   // conflict with each other.
1204   unsigned UnknownTypeNum = 0;
1205   for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
1206     if (!OffsetCache.contains(Index))
1207       continue;
1208 
1209     std::string Name;
1210     uint64_t SizeInBits;
1211     uint32_t AlignInBits;
1212     uint64_t OffsetInBits;
1213     DIType *DITy = nullptr;
1214 
1215     Type *Ty = FrameTy->getElementType(Index);
1216     assert(Ty->isSized() && "We can't handle type which is not sized.\n");
1217     SizeInBits = Layout.getTypeSizeInBits(Ty).getFixedValue();
1218     AlignInBits = OffsetCache[Index].first * 8;
1219     OffsetInBits = OffsetCache[Index].second * 8;
1220 
1221     if (NameCache.contains(Index)) {
1222       Name = NameCache[Index].str();
1223       DITy = TyCache[Index];
1224     } else {
1225       DITy = solveDIType(DBuilder, Ty, Layout, FrameDITy, LineNum, DITypeCache);
1226       assert(DITy && "SolveDIType shouldn't return nullptr.\n");
1227       Name = DITy->getName().str();
1228       Name += "_" + std::to_string(UnknownTypeNum);
1229       UnknownTypeNum++;
1230     }
1231 
1232     Elements.push_back(DBuilder.createMemberType(
1233         FrameDITy, Name, DFile, LineNum, SizeInBits, AlignInBits, OffsetInBits,
1234         llvm::DINode::FlagArtificial, DITy));
1235   }
1236 
1237   DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements));
1238 
1239   auto *FrameDIVar =
1240       DBuilder.createAutoVariable(DIS, "__coro_frame", DFile, LineNum,
1241                                   FrameDITy, true, DINode::FlagArtificial);
1242 
1243   // The subprogram has a RetainedNodes field which records the debug
1244   // variables it contains, so we need to add __coro_frame to its
1245   // RetainedNodes.
1246   //
1247   // If we don't add __coro_frame to the RetainedNodes, the user may get
1248   // `no symbol __coro_frame in context` rather than the more precise
1249   // `__coro_frame is optimized out`.
1250   auto RetainedNodes = DIS->getRetainedNodes();
1251   SmallVector<Metadata *, 32> RetainedNodesVec(RetainedNodes.begin(),
1252                                                RetainedNodes.end());
1253   RetainedNodesVec.push_back(FrameDIVar);
1254   DIS->replaceOperandWith(7, (MDTuple::get(F.getContext(), RetainedNodesVec)));
1255 
1256   // Construct the location for the frame debug variable. The column number
1257   // is fake but it should be fine.
1258   DILocation *DILoc =
1259       DILocation::get(DIS->getContext(), LineNum, /*Column=*/1, DIS);
1260   assert(FrameDIVar->isValidLocationForIntrinsic(DILoc));
1261 
1262   if (UseNewDbgInfoFormat) {
1263     DbgVariableRecord *NewDVR =
1264         new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar,
1265                               DBuilder.createExpression(), DILoc,
1266                               DbgVariableRecord::LocationType::Declare);
1267     BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
1268     It->getParent()->insertDbgRecordBefore(NewDVR, It);
1269   } else {
1270     DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
1271                            DBuilder.createExpression(), DILoc,
1272                            &*Shape.getInsertPtAfterFramePtr());
1273   }
1274 }
1275 
1276 // Build a struct that will keep state for an active coroutine.
1277 //   struct f.frame {
1278 //     ResumeFnTy ResumeFnAddr;
1279 //     ResumeFnTy DestroyFnAddr;
1280 //     ... promise (if present) ...
1281 //     int ResumeIndex;
1282 //     ... spills ...
1283 //   };
1284 static StructType *buildFrameType(Function &F, coro::Shape &Shape,
1285                                   FrameDataInfo &FrameData) {
1286   LLVMContext &C = F.getContext();
1287   const DataLayout &DL = F.getDataLayout();
1288   StructType *FrameTy = [&] {
1289     SmallString<32> Name(F.getName());
1290     Name.append(".Frame");
1291     return StructType::create(C, Name);
1292   }();
1293 
1294   // We will use this value to cap the alignment of spilled values.
1295   std::optional<Align> MaxFrameAlignment;
1296   if (Shape.ABI == coro::ABI::Async)
1297     MaxFrameAlignment = Shape.AsyncLowering.getContextAlignment();
1298   FrameTypeBuilder B(C, DL, MaxFrameAlignment);
1299 
1300   AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
1301   std::optional<FieldIDType> SwitchIndexFieldId;
1302 
1303   if (Shape.ABI == coro::ABI::Switch) {
1304     auto *FnPtrTy = PointerType::getUnqual(C);
1305 
1306     // Add header fields for the resume and destroy functions.
1307     // We can rely on these being perfectly packed.
1308     (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);
1309     (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);
1310 
1311     // PromiseAlloca field needs to be explicitly added here because it's
1312     // a header field with a fixed offset based on its alignment. Hence it
1313     // needs special handling and cannot be added to FrameData.Allocas.
1314     if (PromiseAlloca)
1315       FrameData.setFieldIndex(
1316           PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true));
1317 
1318     // Add a field to store the suspend index.  This doesn't need to
1319     // be in the header.
1320     unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
1321     Type *IndexType = Type::getIntNTy(C, IndexBits);
1322 
1323     SwitchIndexFieldId = B.addField(IndexType, std::nullopt);
1324   } else {
1325     assert(PromiseAlloca == nullptr && "lowering doesn't support promises");
1326   }
1327 
1328   // Because multiple allocas may share the same field slot,
1329   // we add fields for the allocas here.
1330   B.addFieldForAllocas(F, FrameData, Shape);
1331   // Add PromiseAlloca to Allocas list so that
1332   // 1. updateLayoutIndex could update its index after
1333   // `performOptimizedStructLayout`
1334   // 2. it is processed in insertSpills.
1335   if (Shape.ABI == coro::ABI::Switch && PromiseAlloca)
1336     // We assume that the promise alloca won't be modified before
1337     // CoroBegin and no alias will be created before CoroBegin.
1338     FrameData.Allocas.emplace_back(
1339         PromiseAlloca, DenseMap<Instruction *, std::optional<APInt>>{}, false);
1340   // Create an entry for every spilled value.
1341   for (auto &S : FrameData.Spills) {
1342     Type *FieldType = S.first->getType();
1343     // For byval arguments, we need to store the pointed value in the frame,
1344     // instead of the pointer itself.
1345     if (const Argument *A = dyn_cast<Argument>(S.first))
1346       if (A->hasByValAttr())
1347         FieldType = A->getParamByValType();
1348     FieldIDType Id = B.addField(FieldType, std::nullopt, false /*header*/,
1349                                 true /*IsSpillOfValue*/);
1350     FrameData.setFieldIndex(S.first, Id);
1351   }
1352 
1353   B.finish(FrameTy);
1354   FrameData.updateLayoutIndex(B);
1355   Shape.FrameAlign = B.getStructAlign();
1356   Shape.FrameSize = B.getStructSize();
1357 
1358   switch (Shape.ABI) {
1359   case coro::ABI::Switch: {
1360     // In the switch ABI, remember the switch-index field.
1361     auto IndexField = B.getLayoutField(*SwitchIndexFieldId);
1362     Shape.SwitchLowering.IndexField = IndexField.LayoutFieldIndex;
1363     Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value();
1364     Shape.SwitchLowering.IndexOffset = IndexField.Offset;
1365 
1366     // Also round the frame size up to a multiple of its alignment, as is
1367     // generally expected in C/C++.
1368     Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
1369     break;
1370   }
1371 
1372   // In the retcon ABI, remember whether the frame is inline in the storage.
1373   case coro::ABI::Retcon:
1374   case coro::ABI::RetconOnce: {
1375     auto Id = Shape.getRetconCoroId();
1376     Shape.RetconLowering.IsFrameInlineInStorage
1377       = (B.getStructSize() <= Id->getStorageSize() &&
1378          B.getStructAlign() <= Id->getStorageAlignment());
1379     break;
1380   }
1381   case coro::ABI::Async: {
1382     Shape.AsyncLowering.FrameOffset =
1383         alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign);
1384     // Also make the final context size a multiple of the context alignment to
1385     // make allocation easier for allocators.
1386     Shape.AsyncLowering.ContextSize =
1387         alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize,
1388                 Shape.AsyncLowering.getContextAlignment());
1389     if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) {
1390       report_fatal_error(
1391           "The alignment requirment of frame variables cannot be higher than "
1392           "the alignment of the async function context");
1393     }
1394     break;
1395   }
1396   }
1397 
1398   return FrameTy;
1399 }
1400 
1401 // We use a pointer use visitor to track how an alloca is being used.
1402 // The goal is to be able to answer the following three questions:
1403 // 1. Should this alloca be allocated on the frame instead?
1404 // 2. Could the content of the alloca be modified prior to CoroBegin, which
1405 // would require copying the data from the alloca to the frame after CoroBegin?
1406 // 3. Is there any alias created for this alloca prior to CoroBegin but used
1407 // after CoroBegin? In that case, we will need to recreate the alias after
1408 // CoroBegin based off the frame.
1409 // To answer question 1, we track two things:
1410 //   a. The list of all BasicBlocks that use this alloca or any alias of it.
1411 //   In the end, we check whether any pair of these blocks crosses a
1412 //   suspension point. If so, this alloca must be put on the frame.
1413 //   b. Whether the alloca or any alias of the alloca escapes at some point,
1414 //   either because the address is stored somewhere or because it is passed to
1415 //   a function call that might capture it. If it ever escapes, this alloca
1416 //   must conservatively be put on the frame.
1417 // To answer question 2, we track the variable MayWriteBeforeCoroBegin.
1418 // Whenever a potential write happens, either through a store instruction, a
1419 // function call or any of the memory intrinsics, we check whether the
1420 // instruction occurs prior to CoroBegin. To answer question 3, we track the
1421 // offsets of all aliases created for the alloca prior to CoroBegin but used
1422 // after CoroBegin. std::optional is used to represent the case where the
1423 // offset is unknown (e.g. when a PHINode takes in different offset values).
1424 // We cannot handle unknown offsets and will assert. This is the one issue
1425 // left open; an ideal solution would likely require a significant redesign.
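     //
     // Illustrative example (not from the original comment; hypothetical values):
     // an alias created before CoroBegin and used after it:
     //
     //   %x = alloca [16 x i8]
     //   %p = getelementptr inbounds [16 x i8], ptr %x, i64 0, i64 8 ; alias, offset 8
     //   %hdl = call ptr @llvm.coro.begin(...)
     //   ...
     //   store i8 1, ptr %p                         ; use of the alias after CoroBegin
     //
     // Here %p is recorded with offset 8 and, once %x is moved to the coroutine
     // frame, must be recreated after CoroBegin as the frame address of %x plus 8.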
1426 namespace {
1427 struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
1428   using Base = PtrUseVisitor<AllocaUseVisitor>;
1429   AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
1430                    const coro::Shape &CoroShape,
1431                    const SuspendCrossingInfo &Checker,
1432                    bool ShouldUseLifetimeStartInfo)
1433       : PtrUseVisitor(DL), DT(DT), CoroShape(CoroShape), Checker(Checker),
1434         ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {
1435     for (AnyCoroSuspendInst *SuspendInst : CoroShape.CoroSuspends)
1436       CoroSuspendBBs.insert(SuspendInst->getParent());
1437   }
1438 
1439   void visit(Instruction &I) {
1440     Users.insert(&I);
1441     Base::visit(I);
1442     // If the pointer is escaped prior to CoroBegin, we have to assume it would
1443     // be written into before CoroBegin as well.
1444     if (PI.isEscaped() &&
1445         !DT.dominates(CoroShape.CoroBegin, PI.getEscapingInst())) {
1446       MayWriteBeforeCoroBegin = true;
1447     }
1448   }
1449   // We need to provide this overload as PtrUseVisitor uses a pointer based
1450   // visiting function.
1451   void visit(Instruction *I) { return visit(*I); }
1452 
1453   void visitPHINode(PHINode &I) {
1454     enqueueUsers(I);
1455     handleAlias(I);
1456   }
1457 
1458   void visitSelectInst(SelectInst &I) {
1459     enqueueUsers(I);
1460     handleAlias(I);
1461   }
1462 
1463   void visitStoreInst(StoreInst &SI) {
1464     // Regardless of whether the alias of the alloca is the value operand or the
1465     // pointer operand, we need to assume the alloca is being written to.
1466     handleMayWrite(SI);
1467 
1468     if (SI.getValueOperand() != U->get())
1469       return;
1470 
1471     // We are storing the pointer into a memory location, potentially escaping.
1472     // As an optimization, we try to detect simple cases where it doesn't
1473     // actually escape, for example:
1474     //   %ptr = alloca ..
1475     //   %addr = alloca ..
1476     //   store %ptr, %addr
1477     //   %x = load %addr
1478     //   ..
1479     // If %addr is only used by loading from it, we can simply treat %x as
1480     // another alias of %ptr without considering %ptr escaped.
1481     auto IsSimpleStoreThenLoad = [&]() {
1482       auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand());
1483       // If the memory location we are storing to is not an alloca, it
1484       // could be an alias of some other memory locations, which is difficult
1485       // to analyze.
1486       if (!AI)
1487         return false;
1488       // StoreAliases contains aliases of the memory location stored into.
1489       SmallVector<Instruction *, 4> StoreAliases = {AI};
1490       while (!StoreAliases.empty()) {
1491         Instruction *I = StoreAliases.pop_back_val();
1492         for (User *U : I->users()) {
1493           // If we are loading from the memory location, we are creating an
1494           // alias of the original pointer.
1495           if (auto *LI = dyn_cast<LoadInst>(U)) {
1496             enqueueUsers(*LI);
1497             handleAlias(*LI);
1498             continue;
1499           }
1500           // If we are overwriting the memory location, the pointer certainly
1501           // won't escape.
1502           if (auto *S = dyn_cast<StoreInst>(U))
1503             if (S->getPointerOperand() == I)
1504               continue;
1505           if (auto *II = dyn_cast<IntrinsicInst>(U))
1506             if (II->isLifetimeStartOrEnd())
1507               continue;
1508           // BitCastInst creates aliases of the memory location being stored
1509           // into.
1510           if (auto *BI = dyn_cast<BitCastInst>(U)) {
1511             StoreAliases.push_back(BI);
1512             continue;
1513           }
1514           return false;
1515         }
1516       }
1517 
1518       return true;
1519     };
1520 
1521     if (!IsSimpleStoreThenLoad())
1522       PI.setEscaped(&SI);
1523   }
1524 
1525   // All mem intrinsics modify the data.
1526   void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); }
1527 
1528   void visitBitCastInst(BitCastInst &BC) {
1529     Base::visitBitCastInst(BC);
1530     handleAlias(BC);
1531   }
1532 
1533   void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1534     Base::visitAddrSpaceCastInst(ASC);
1535     handleAlias(ASC);
1536   }
1537 
1538   void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1539     // The base visitor will adjust Offset accordingly.
1540     Base::visitGetElementPtrInst(GEPI);
1541     handleAlias(GEPI);
1542   }
1543 
1544   void visitIntrinsicInst(IntrinsicInst &II) {
1545     // When the lifetime markers refer to a subrange of the
1546     // original alloca, ignore them to avoid misleading the
1547     // analysis.
1548     if (!IsOffsetKnown || !Offset.isZero())
1549       return Base::visitIntrinsicInst(II);
1550     switch (II.getIntrinsicID()) {
1551     default:
1552       return Base::visitIntrinsicInst(II);
1553     case Intrinsic::lifetime_start:
1554       LifetimeStarts.insert(&II);
1555       LifetimeStartBBs.push_back(II.getParent());
1556       break;
1557     case Intrinsic::lifetime_end:
1558       LifetimeEndBBs.insert(II.getParent());
1559       break;
1560     }
1561   }
1562 
1563   void visitCallBase(CallBase &CB) {
1564     for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op)
1565       if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op))
1566         PI.setEscaped(&CB);
1567     handleMayWrite(CB);
1568   }
1569 
1570   bool getShouldLiveOnFrame() const {
1571     if (!ShouldLiveOnFrame)
1572       ShouldLiveOnFrame = computeShouldLiveOnFrame();
1573     return *ShouldLiveOnFrame;
1574   }
1575 
1576   bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; }
1577 
1578   DenseMap<Instruction *, std::optional<APInt>> getAliasesCopy() const {
1579     assert(getShouldLiveOnFrame() && "This method should only be called if the "
1580                                      "alloca needs to live on the frame.");
1581     for (const auto &P : AliasOffetMap)
1582       if (!P.second)
1583         report_fatal_error("Unable to handle an alias with unknown offset "
1584                            "created before CoroBegin.");
1585     return AliasOffetMap;
1586   }
1587 
1588 private:
1589   const DominatorTree &DT;
1590   const coro::Shape &CoroShape;
1591   const SuspendCrossingInfo &Checker;
1592   // All aliases of the original AllocaInst, created before CoroBegin and used
1593   // after CoroBegin. Each entry contains the instruction and its offset into the
1594   // original Alloca. They need to be recreated after CoroBegin off the frame.
1595   DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{};
1596   SmallPtrSet<Instruction *, 4> Users{};
1597   SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
1598   SmallVector<BasicBlock *> LifetimeStartBBs{};
1599   SmallPtrSet<BasicBlock *, 2> LifetimeEndBBs{};
1600   SmallPtrSet<const BasicBlock *, 2> CoroSuspendBBs{};
1601   bool MayWriteBeforeCoroBegin{false};
1602   bool ShouldUseLifetimeStartInfo{true};
1603 
1604   mutable std::optional<bool> ShouldLiveOnFrame{};
1605 
1606   bool computeShouldLiveOnFrame() const {
1607     // If lifetime information is available, we check it first since it's
1608     // more precise. We look at every pair of lifetime.start intrinsic and
1609     // every basic block that uses the pointer to see if they cross suspension
1610     // points. The uses cover both direct uses as well as indirect uses.
1611     if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
1612       // If there is no explicit lifetime.end, then assume the address can
1613       // cross suspension points.
1614       if (LifetimeEndBBs.empty())
1615         return true;
1616 
1617       // If there is a path from a lifetime.start to a suspend without a
1618       // corresponding lifetime.end, then the alloca's lifetime persists
1619       // beyond that suspension point and the alloca must go on the frame.
1620       llvm::SmallVector<BasicBlock *> Worklist(LifetimeStartBBs);
1621       if (isManyPotentiallyReachableFromMany(Worklist, CoroSuspendBBs,
1622                                              &LifetimeEndBBs, &DT))
1623         return true;
1624 
1625       // Addresses are guaranteed to be identical after every lifetime.start so
1626       // we cannot use the local stack if the address escaped and there is a
1627       // suspend point between lifetime markers. This should also cover the
1628       // case of a single lifetime.start intrinsic in a loop with a suspend point.
1629       if (PI.isEscaped()) {
1630         for (auto *A : LifetimeStarts) {
1631           for (auto *B : LifetimeStarts) {
1632             if (Checker.hasPathOrLoopCrossingSuspendPoint(A->getParent(),
1633                                                           B->getParent()))
1634               return true;
1635           }
1636         }
1637       }
1638       return false;
1639     }
1640     // FIXME: Ideally the isEscaped check should come at the beginning.
1641     // However there are a few loose ends that need to be fixed first before
1642     // we can do that. We need to make sure we are not over-conservative, so
1643     // that the data accessed in-between await_suspend and symmetric transfer
1644     // is always put on the stack, and also data accessed after coro.end is
1645     // always put on the stack (esp the return object). To fix that, we need
1646     // to:
1647     //  1) Potentially treat sret as nocapture in calls
1648     //  2) Specially handle the return object and put it on the stack
1649     //  3) Utilize the lifetime.end intrinsic
1650     if (PI.isEscaped())
1651       return true;
1652 
1653     for (auto *U1 : Users)
1654       for (auto *U2 : Users)
1655         if (Checker.isDefinitionAcrossSuspend(*U1, U2))
1656           return true;
1657 
1658     return false;
1659   }
1660 
1661   void handleMayWrite(const Instruction &I) {
1662     if (!DT.dominates(CoroShape.CoroBegin, &I))
1663       MayWriteBeforeCoroBegin = true;
1664   }
1665 
1666   bool usedAfterCoroBegin(Instruction &I) {
1667     for (auto &U : I.uses())
1668       if (DT.dominates(CoroShape.CoroBegin, U))
1669         return true;
1670     return false;
1671   }
1672 
1673   void handleAlias(Instruction &I) {
1674     // We track all aliases created prior to CoroBegin but used after.
1675     // These aliases may need to be recreated after CoroBegin if the alloca
1676     // needs to live on the frame.
1677     if (DT.dominates(CoroShape.CoroBegin, &I) || !usedAfterCoroBegin(I))
1678       return;
1679 
1680     if (!IsOffsetKnown) {
1681       AliasOffetMap[&I].reset();
1682     } else {
1683       auto Itr = AliasOffetMap.find(&I);
1684       if (Itr == AliasOffetMap.end()) {
1685         AliasOffetMap[&I] = Offset;
1686       } else if (Itr->second && *Itr->second != Offset) {
1687         // If we have seen two different possible values for this alias, we set
1688         // it to empty.
1689         AliasOffetMap[&I].reset();
1690       }
1691     }
1692   }
1693 };
1694 } // namespace
1695 
1696 // We need to make room to insert a spill after the initial PHIs, but before the
1697 // catchswitch instruction. Placing the spill there would violate the requirement
1698 // that a catchswitch, like all other EH pads, must be the first non-PHI in a block.
1699 //
1700 // Split away catchswitch into a separate block and insert in its place:
1701 //
1702 //   cleanuppad <InsertPt> cleanupret.
1703 //
1704 // The cleanupret instruction will act as an insertion point for the spill.
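     //
     // Illustrative sketch (hypothetical block and value names). Before:
     //
     //   bb:
     //     %p = phi ...
     //     catchswitch within ...
     //
     // After splitBeforeCatchSwitch:
     //
     //   bb:
     //     %p = phi ...
     //     %cp = cleanuppad within <parent> []
     //     ; spill instructions are inserted here
     //     cleanupret from %cp unwind label %bb.split
     //   bb.split:
     //     catchswitch within ...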
1705 static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) {
1706   BasicBlock *CurrentBlock = CatchSwitch->getParent();
1707   BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch);
1708   CurrentBlock->getTerminator()->eraseFromParent();
1709 
1710   auto *CleanupPad =
1711       CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock);
1712   auto *CleanupRet =
1713       CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock);
1714   return CleanupRet;
1715 }
1716 
1717 // Replace all alloca and SSA values that are accessed across suspend points
1718 // with GetElementPointer from coroutine frame + loads and stores. Create an
1719 // AllocaSpillBB that will become the new entry block for the resume parts of
1720 // the coroutine:
1721 //
1722 //    %hdl = coro.begin(...)
1723 //    whatever
1724 //
1725 // becomes:
1726 //
1727 //    %hdl = coro.begin(...)
1728 //    br label %AllocaSpillBB
1729 //
1730 //  AllocaSpillBB:
1731 //    ; geps corresponding to allocas that were moved to coroutine frame
1732 //    br label PostSpill
1733 //
1734 //  PostSpill:
1735 //    whatever
1736 //
1737 //
1738 static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
1739   auto *CB = Shape.CoroBegin;
1740   LLVMContext &C = CB->getContext();
1741   Function *F = CB->getFunction();
1742   IRBuilder<> Builder(C);
1743   StructType *FrameTy = Shape.FrameTy;
1744   Value *FramePtr = Shape.FramePtr;
1745   DominatorTree DT(*F);
1746   SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
1747 
1748   // Create a GEP with the given index into the coroutine frame for the original
1749   // value Orig. Appends an extra 0 index for array-allocas, preserving the
1750   // original type.
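       // For example (illustrative, hypothetical field index): an array alloca
       //   %buf = alloca i32, i32 8
       // assigned to frame field 4 is addressed as
       //   getelementptr inbounds %f.Frame, ptr %FramePtr, i32 0, i32 4, i32 0
       // where the trailing 0 points at the first i32 of the [8 x i32] field.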
1751   auto GetFramePointer = [&](Value *Orig) -> Value * {
1752     FieldIDType Index = FrameData.getFieldIndex(Orig);
1753     SmallVector<Value *, 3> Indices = {
1754         ConstantInt::get(Type::getInt32Ty(C), 0),
1755         ConstantInt::get(Type::getInt32Ty(C), Index),
1756     };
1757 
1758     if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
1759       if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
1760         auto Count = CI->getValue().getZExtValue();
1761         if (Count > 1) {
1762           Indices.push_back(ConstantInt::get(Type::getInt32Ty(C), 0));
1763         }
1764       } else {
1765         report_fatal_error("Coroutines cannot handle non-static allocas yet");
1766       }
1767     }
1768 
1769     auto GEP = cast<GetElementPtrInst>(
1770         Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices));
1771     if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
1772       if (FrameData.getDynamicAlign(Orig) != 0) {
1773         assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value());
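             // Round the raw frame address up to the alloca's alignment:
             //   addr = (addr + Align - 1) & ~(Align - 1)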
1774         auto *M = AI->getModule();
1775         auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType());
1776         auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy);
1777         auto *AlignMask =
1778             ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1);
1779         PtrValue = Builder.CreateAdd(PtrValue, AlignMask);
1780         PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask));
1781         return Builder.CreateIntToPtr(PtrValue, AI->getType());
1782       }
1783       // If the type of the GEP is not equal to the type of the AllocaInst, it
1784       // implies that the AllocaInst may share its frame slot with another
1785       // AllocaInst. So we cast the GEP to the AllocaInst's type here to reuse
1786       // the frame storage.
1787       //
1788       // Note: If we change the strategy for dealing with alignment, we need to
1789       // refine this casting.
1790       if (GEP->getType() != Orig->getType())
1791         return Builder.CreateAddrSpaceCast(GEP, Orig->getType(),
1792                                            Orig->getName() + Twine(".cast"));
1793     }
1794     return GEP;
1795   };
1796 
1797   for (auto const &E : FrameData.Spills) {
1798     Value *Def = E.first;
1799     auto SpillAlignment = Align(FrameData.getAlign(Def));
1800     // Create a store instruction storing the value into the
1801     // coroutine frame.
1802     BasicBlock::iterator InsertPt;
1803     Type *ByValTy = nullptr;
1804     if (auto *Arg = dyn_cast<Argument>(Def)) {
1805       // For arguments, we will place the store instruction right after
1806       // the coroutine frame pointer instruction, i.e. coro.begin.
1807       InsertPt = Shape.getInsertPtAfterFramePtr();
1808 
1809       // If we're spilling an Argument, make sure we clear 'nocapture'
1810       // from the coroutine function.
1811       Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture);
1812 
1813       if (Arg->hasByValAttr())
1814         ByValTy = Arg->getParamByValType();
1815     } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) {
1816       // Don't spill immediately after a suspend; splitting assumes
1817       // that the suspend will be followed by a branch.
1818       InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHIIt();
1819     } else {
1820       auto *I = cast<Instruction>(Def);
1821       if (!DT.dominates(CB, I)) {
1822         // If it is not dominated by CoroBegin, then spill should be
1823         // inserted immediately after CoroFrame is computed.
1824         InsertPt = Shape.getInsertPtAfterFramePtr();
1825       } else if (auto *II = dyn_cast<InvokeInst>(I)) {
1826         // If we are spilling the result of the invoke instruction, split
1827         // the normal edge and insert the spill in the new block.
1828         auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest());
1829         InsertPt = NewBB->getTerminator()->getIterator();
1830       } else if (isa<PHINode>(I)) {
1831         // Skip the PHINodes and EH pad instructions.
1832         BasicBlock *DefBlock = I->getParent();
1833         if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator()))
1834           InsertPt = splitBeforeCatchSwitch(CSI)->getIterator();
1835         else
1836           InsertPt = DefBlock->getFirstInsertionPt();
1837       } else {
1838         assert(!I->isTerminator() && "unexpected terminator");
1839         // For all other values, the spill is placed immediately after
1840         // the definition.
1841         InsertPt = I->getNextNode()->getIterator();
1842       }
1843     }
1844 
1845     auto Index = FrameData.getFieldIndex(Def);
1846     Builder.SetInsertPoint(InsertPt->getParent(), InsertPt);
1847     auto *G = Builder.CreateConstInBoundsGEP2_32(
1848         FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr"));
1849     if (ByValTy) {
1850       // For byval arguments, we need to store the pointed-to value in the frame,
1851       // instead of the pointer itself.
1852       auto *Value = Builder.CreateLoad(ByValTy, Def);
1853       Builder.CreateAlignedStore(Value, G, SpillAlignment);
1854     } else {
1855       Builder.CreateAlignedStore(Def, G, SpillAlignment);
1856     }
1857 
1858     BasicBlock *CurrentBlock = nullptr;
1859     Value *CurrentReload = nullptr;
1860     for (auto *U : E.second) {
1861       // If we have not yet seen this use block, create a GEP into the frame and
1862       // a load instruction to reload the spilled value from the coroutine frame.
1863       // For byval arguments, the frame GEP itself serves as the reload.
1864       if (CurrentBlock != U->getParent()) {
1865         CurrentBlock = U->getParent();
1866         Builder.SetInsertPoint(CurrentBlock,
1867                                CurrentBlock->getFirstInsertionPt());
1868 
1869         auto *GEP = GetFramePointer(E.first);
1870         GEP->setName(E.first->getName() + Twine(".reload.addr"));
1871         if (ByValTy)
1872           CurrentReload = GEP;
1873         else
1874           CurrentReload = Builder.CreateAlignedLoad(
1875               FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP,
1876               SpillAlignment, E.first->getName() + Twine(".reload"));
1877 
1878         TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def);
1879         TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(Def);
1880         // Try our best to find a dbg.declare. If the spill is a temporary, there
1881         // may not be a direct dbg.declare; walk up the load chain to find one
1882         // through an alias.
1883         if (F->getSubprogram()) {
1884           auto *CurDef = Def;
1885           while (DIs.empty() && DVRs.empty() && isa<LoadInst>(CurDef)) {
1886             auto *LdInst = cast<LoadInst>(CurDef);
1887             // Only consider pointer-to-pointer loads of the same type.
1888             if (LdInst->getPointerOperandType() != LdInst->getType())
1889               break;
1890             CurDef = LdInst->getPointerOperand();
1891             if (!isa<AllocaInst, LoadInst>(CurDef))
1892               break;
1893             DIs = findDbgDeclares(CurDef);
1894             DVRs = findDVRDeclares(CurDef);
1895           }
1896         }
1897 
1898         auto SalvageOne = [&](auto *DDI) {
1899           bool AllowUnresolved = false;
1900           // This dbg.declare is preserved for all coro-split function
1901           // fragments. It will be unreachable in the main function, and
1902           // processed by coro::salvageDebugInfo() by CoroCloner.
1903           if (UseNewDbgInfoFormat) {
1904             DbgVariableRecord *NewDVR = new DbgVariableRecord(
1905                 ValueAsMetadata::get(CurrentReload), DDI->getVariable(),
1906                 DDI->getExpression(), DDI->getDebugLoc(),
1907                 DbgVariableRecord::LocationType::Declare);
1908             Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore(
1909                 NewDVR, Builder.GetInsertPoint());
1910           } else {
1911             DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
1912                 .insertDeclare(CurrentReload, DDI->getVariable(),
1913                                DDI->getExpression(), DDI->getDebugLoc(),
1914                                &*Builder.GetInsertPoint());
1915           }
1916           // This dbg.declare is for the main function entry point.  It
1917           // will be deleted in all coro-split functions.
1918           coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame,
1919                                  false /*UseEntryValue*/);
1920         };
1921         for_each(DIs, SalvageOne);
1922         for_each(DVRs, SalvageOne);
1923       }
1924 
1925       // If we have a single edge PHINode, remove it and replace it with a
1926       // reload from the coroutine frame. (We already took care of multi edge
1927       // PHINodes by rewriting them in the rewritePHIs function).
1928       if (auto *PN = dyn_cast<PHINode>(U)) {
1929         assert(PN->getNumIncomingValues() == 1 &&
1930                "unexpected number of incoming "
1931                "values in the PHINode");
1932         PN->replaceAllUsesWith(CurrentReload);
1933         PN->eraseFromParent();
1934         continue;
1935       }
1936 
1937       // Replace all uses of CurrentValue in the current instruction with
1938       // reload.
1939       U->replaceUsesOfWith(Def, CurrentReload);
1940       // Instructions are added to Def's user list if the attached
1941       // debug records use Def. Update those now.
1942       for (DbgVariableRecord &DVR : filterDbgVars(U->getDbgRecordRange()))
1943         DVR.replaceVariableLocationOp(Def, CurrentReload, true);
1944     }
1945   }
1946 
1947   BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent();
1948 
1949   auto SpillBlock = FramePtrBB->splitBasicBlock(
1950       Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB");
1951   SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill");
1952   Shape.AllocaSpillBlock = SpillBlock;
1953 
1954   // retcon and retcon.once lowering assumes all uses have been sunk.
1955   if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
1956       Shape.ABI == coro::ABI::Async) {
1957     // If we found any allocas, replace all of their remaining uses with GEPs.
1958     Builder.SetInsertPoint(SpillBlock, SpillBlock->begin());
1959     for (const auto &P : FrameData.Allocas) {
1960       AllocaInst *Alloca = P.Alloca;
1961       auto *G = GetFramePointer(Alloca);
1962 
1963       // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G))
1964       // here, as we are changing the location of the instruction.
1965       G->takeName(Alloca);
1966       Alloca->replaceAllUsesWith(G);
1967       Alloca->eraseFromParent();
1968     }
1969     return;
1970   }
1971 
1972   // If we found any allocas, replace all of their remaining uses with GEP
1973   // instructions. To preserve debuggability, we replace the uses of allocas in
1974   // dbg.declares and dbg.values with the reload from the frame.
1975   // Note: We cannot replace the allocas with GEP instructions indiscriminately,
1976   // as some of the uses may not be dominated by CoroBegin.
1977   Builder.SetInsertPoint(Shape.AllocaSpillBlock,
1978                          Shape.AllocaSpillBlock->begin());
1979   SmallVector<Instruction *, 4> UsersToUpdate;
1980   for (const auto &A : FrameData.Allocas) {
1981     AllocaInst *Alloca = A.Alloca;
1982     UsersToUpdate.clear();
1983     for (User *U : Alloca->users()) {
1984       auto *I = cast<Instruction>(U);
1985       if (DT.dominates(CB, I))
1986         UsersToUpdate.push_back(I);
1987     }
1988     if (UsersToUpdate.empty())
1989       continue;
1990     auto *G = GetFramePointer(Alloca);
1991     G->setName(Alloca->getName() + Twine(".reload.addr"));
1992 
1993     SmallVector<DbgVariableIntrinsic *, 4> DIs;
1994     SmallVector<DbgVariableRecord *> DbgVariableRecords;
1995     findDbgUsers(DIs, Alloca, &DbgVariableRecords);
1996     for (auto *DVI : DIs)
1997       DVI->replaceUsesOfWith(Alloca, G);
1998     for (auto *DVR : DbgVariableRecords)
1999       DVR->replaceVariableLocationOp(Alloca, G);
2000 
2001     for (Instruction *I : UsersToUpdate) {
2002       // Lifetime intrinsics that now refer to a member of the coroutine frame
2003       // are meaningless, and such meaningless lifetime intrinsics may block
2004       // further optimizations, so erase them.
2005       if (I->isLifetimeStartOrEnd()) {
2006         I->eraseFromParent();
2007         continue;
2008       }
2009 
2010       I->replaceUsesOfWith(Alloca, G);
2011     }
2012   }
2013   Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
2014   for (const auto &A : FrameData.Allocas) {
2015     AllocaInst *Alloca = A.Alloca;
2016     if (A.MayWriteBeforeCoroBegin) {
2017       // MayWriteBeforeCoroBegin means the alloca is potentially modified before CoroBegin.
2018       if (Alloca->isArrayAllocation())
2019         report_fatal_error(
2020             "Coroutines cannot handle copying of array allocas yet");
2021 
2022       auto *G = GetFramePointer(Alloca);
2023       auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca);
2024       Builder.CreateStore(Value, G);
2025     }
2026     // For each alias of Alloca created before CoroBegin but used after
2027     // CoroBegin, we recreate it after CoroBegin by applying the offset
2028     // to the pointer in the frame.
2029     for (const auto &Alias : A.Aliases) {
2030       auto *FramePtr = GetFramePointer(Alloca);
2031       auto &Value = *Alias.second;
2032       auto ITy = IntegerType::get(C, Value.getBitWidth());
2033       auto *AliasPtr =
2034           Builder.CreatePtrAdd(FramePtr, ConstantInt::get(ITy, Value));
2035       Alias.first->replaceUsesWithIf(
2036           AliasPtr, [&](Use &U) { return DT.dominates(CB, U); });
2037     }
2038   }
2039 
2040   // PromiseAlloca is not collected in FrameData.Allocas, so the code above does
2041   // not handle the case where the PromiseAlloca is written before CoroBegin.
2042   // That may be problematic in edge cases. See
2043   // https://github.com/llvm/llvm-project/issues/57861 for an example.
2044   if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) {
2045     AllocaInst *PA = Shape.SwitchLowering.PromiseAlloca;
2046     // Check whether there is a memory access to the promise alloca before CoroBegin.
2047     bool HasAccessingPromiseBeforeCB = llvm::any_of(PA->uses(), [&](Use &U) {
2048       auto *Inst = dyn_cast<Instruction>(U.getUser());
2049       if (!Inst || DT.dominates(CB, Inst))
2050         return false;
2051 
2052       if (auto *CI = dyn_cast<CallInst>(Inst)) {
2053         // It is fine if the call doesn't write to the Promise.
2054         // This is possible for the @llvm.coro.id intrinsic, which
2055         // takes the promise as its second argument purely as a
2056         // marker.
2057         if (CI->onlyReadsMemory() ||
2058             CI->onlyReadsMemory(CI->getArgOperandNo(&U)))
2059           return false;
2060         return true;
2061       }
2062 
2063       return isa<StoreInst>(Inst) ||
2064              // It may take too much time to track the uses.
2065              // Be conservative about the case the use may escape.
2066              // Be conservative in case the use escapes.
2067              // There would always be a bitcast for the promise alloca
2068              // Before opaque pointers were enabled, there would always be a
2069              // bitcast of the promise alloca. Now that opaque pointers are
2070              // enabled by default, keeping this conservative check should be
2071              // fine.
2072     });
2073     if (HasAccessingPromiseBeforeCB) {
2074       Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
2075       auto *G = GetFramePointer(PA);
2076       auto *Value = Builder.CreateLoad(PA->getAllocatedType(), PA);
2077       Builder.CreateStore(Value, G);
2078     }
2079   }
2080 }
2081 
2082 // Moves the values in the PHIs in SuccBB that correspond to PredBB into a new
2083 // PHI in InsertedBB.
2084 static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB,
2085                                          BasicBlock *InsertedBB,
2086                                          BasicBlock *PredBB,
2087                                          PHINode *UntilPHI = nullptr) {
2088   auto *PN = cast<PHINode>(&SuccBB->front());
2089   do {
2090     int Index = PN->getBasicBlockIndex(InsertedBB);
2091     Value *V = PN->getIncomingValue(Index);
2092     PHINode *InputV = PHINode::Create(
2093         V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName());
2094     InputV->insertBefore(InsertedBB->begin());
2095     InputV->addIncoming(V, PredBB);
2096     PN->setIncomingValue(Index, InputV);
2097     PN = dyn_cast<PHINode>(PN->getNextNode());
2098   } while (PN != UntilPHI);
2099 }
2100 
2101 // Rewrites the PHI Nodes in a cleanuppad.
2102 static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB,
2103                                      CleanupPadInst *CleanupPad) {
2104   // For every incoming edge to a CleanupPad we will create a new block holding
2105   // all incoming values in single-value PHI nodes. We will then create another
2106   // block to act as a dispatcher (as all unwind edges for related EH blocks
2107   // must be the same).
2108   //
2109   // cleanuppad:
2110   //    %2 = phi i32[%0, %catchswitch], [%1, %catch.1]
2111   //    %3 = cleanuppad within none []
2112   //
2113   // It will create:
2114   //
2115   // cleanuppad.corodispatch
2116   //    %2 = phi i8[0, %catchswitch], [1, %catch.1]
2117   //    %3 = cleanuppad within none []
2118   //    switch i8 %2, label %unreachable
2119   //            [i8 0, label %cleanuppad.from.catchswitch
2120   //             i8 1, label %cleanuppad.from.catch.1]
2121   // cleanuppad.from.catchswitch:
2122   //    %4 = phi i32 [%0, %catchswitch]
2123   //    br %label cleanuppad
2124   // cleanuppad.from.catch.1:
2125   //    %6 = phi i32 [%1, %catch.1]
2126   //    br %label cleanuppad
2127   // cleanuppad:
2128   //    %8 = phi i32 [%4, %cleanuppad.from.catchswitch],
2129   //                 [%6, %cleanuppad.from.catch.1]
2130 
2131   // Unreachable BB, in case switching on an invalid value in the dispatcher.
2132   auto *UnreachBB = BasicBlock::Create(
2133       CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent());
2134   IRBuilder<> Builder(UnreachBB);
2135   Builder.CreateUnreachable();
2136 
2137   // Create a new cleanuppad which will be the dispatcher.
2138   auto *NewCleanupPadBB =
2139       BasicBlock::Create(CleanupPadBB->getContext(),
2140                          CleanupPadBB->getName() + Twine(".corodispatch"),
2141                          CleanupPadBB->getParent(), CleanupPadBB);
2142   Builder.SetInsertPoint(NewCleanupPadBB);
2143   auto *SwitchType = Builder.getInt8Ty();
2144   auto *SetDispatchValuePN =
2145       Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB));
2146   CleanupPad->removeFromParent();
2147   CleanupPad->insertAfter(SetDispatchValuePN);
2148   auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB,
2149                                                 pred_size(CleanupPadBB));
2150 
2151   int SwitchIndex = 0;
2152   SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB));
2153   for (BasicBlock *Pred : Preds) {
2154     // Create a new cleanuppad and move the PHI values to there.
2155     auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(),
2156                                       CleanupPadBB->getName() +
2157                                           Twine(".from.") + Pred->getName(),
2158                                       CleanupPadBB->getParent(), CleanupPadBB);
2159     updatePhiNodes(CleanupPadBB, Pred, CaseBB);
2160     CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") +
2161                     Pred->getName());
2162     Builder.SetInsertPoint(CaseBB);
2163     Builder.CreateBr(CleanupPadBB);
2164     movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB);
2165 
2166     // Update this Pred to the new unwind point.
2167     setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB);
2168 
2169     // Setup the switch in the dispatcher.
2170     auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex);
2171     SetDispatchValuePN->addIncoming(SwitchConstant, Pred);
2172     SwitchOnDispatch->addCase(SwitchConstant, CaseBB);
2173     SwitchIndex++;
2174   }
2175 }
2176 
2177 static void cleanupSinglePredPHIs(Function &F) {
2178   SmallVector<PHINode *, 32> Worklist;
2179   for (auto &BB : F) {
2180     for (auto &Phi : BB.phis()) {
2181       if (Phi.getNumIncomingValues() == 1) {
2182         Worklist.push_back(&Phi);
2183       } else
2184         break;
2185     }
2186   }
2187   while (!Worklist.empty()) {
2188     auto *Phi = Worklist.pop_back_val();
2189     auto *OriginalValue = Phi->getIncomingValue(0);
2190     Phi->replaceAllUsesWith(OriginalValue);
2191   }
2192 }
2193 
2194 static void rewritePHIs(BasicBlock &BB) {
2195   // For every incoming edge we will create a block holding all
2196   // incoming values in single-value PHI nodes.
2197   //
2198   // loop:
2199   //    %n.val = phi i32[%n, %entry], [%inc, %loop]
2200   //
2201   // It will create:
2202   //
2203   // loop.from.entry:
2204   //    %n.loop.pre = phi i32 [%n, %entry]
2205   //    br %label loop
2206   // loop.from.loop:
2207   //    %inc.loop.pre = phi i32 [%inc, %loop]
2208   //    br %label loop
2209   //
2210   // After this rewrite, further analysis will ignore any phi nodes with more
2211   // than one incoming edge.
2212 
2213   // TODO: Simplify PHINodes in the basic block to remove duplicate
2214   // predecessors.
2215 
2216   // Special case for CleanupPad: all EH blocks must have the same unwind edge
2217   // so we need to create an additional "dispatcher" block.
2218   if (auto *CleanupPad =
2219           dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) {
2220     SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
2221     for (BasicBlock *Pred : Preds) {
2222       if (CatchSwitchInst *CS =
2223               dyn_cast<CatchSwitchInst>(Pred->getTerminator())) {
2224         // CleanupPad with a CatchSwitch predecessor: therefore this is an
2225         // unwind destination that needs to be handled specially.
2226         assert(CS->getUnwindDest() == &BB);
2227         (void)CS;
2228         rewritePHIsForCleanupPad(&BB, CleanupPad);
2229         return;
2230       }
2231     }
2232   }
2233 
2234   LandingPadInst *LandingPad = nullptr;
2235   PHINode *ReplPHI = nullptr;
2236   if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
2237     // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
2238     // We replace the original landing pad with a PHINode that will collect the
2239     // results from all of them.
2240     ReplPHI = PHINode::Create(LandingPad->getType(), 1, "");
2241     ReplPHI->insertBefore(LandingPad->getIterator());
2242     ReplPHI->takeName(LandingPad);
2243     LandingPad->replaceAllUsesWith(ReplPHI);
2244     // We will erase the original landing pad at the end of this function after
2245     // ehAwareSplitEdge cloned it in the transition blocks.
2246   }
2247 
2248   SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
2249   for (BasicBlock *Pred : Preds) {
2250     auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
2251     IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
2252 
2253     // Stop the moving of values at ReplPHI, as this is either null or the PHI
2254     // that replaced the landing pad.
2255     movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI);
2256   }
2257 
2258   if (LandingPad) {
2259     // Calls to the ehAwareSplitEdge function cloned the original landing pad.
2260     // No longer need it.
2261     LandingPad->eraseFromParent();
2262   }
2263 }
2264 
2265 static void rewritePHIs(Function &F) {
2266   SmallVector<BasicBlock *, 8> WorkList;
2267 
2268   for (BasicBlock &BB : F)
2269     if (auto *PN = dyn_cast<PHINode>(&BB.front()))
2270       if (PN->getNumIncomingValues() > 1)
2271         WorkList.push_back(&BB);
2272 
2273   for (BasicBlock *BB : WorkList)
2274     rewritePHIs(*BB);
2275 }
2276 
2277 /// Default materializable callback
2278 // Check for instructions that we can recreate on resume, as opposed to spilling
2279 // the result into a coroutine frame.
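     //
     // Illustrative example (not from the original source):
     //
     //   %a = add i32 %x, 1                     ; materializable (BinaryOperator)
     //   %s = call i8 @llvm.coro.suspend(...)
     //   ...
     //   %y = mul i32 %a, 2                     ; use after the suspend
     //
     // %a can be recreated (cloned) after the suspend point instead of being
     // spilled, provided its operand %x is itself available there.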
2280 bool coro::defaultMaterializable(Instruction &V) {
2281   return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
2282           isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
2283 }
2284 
2285 // Check for structural coroutine intrinsics that should not be spilled into
2286 // the coroutine frame.
2287 static bool isCoroutineStructureIntrinsic(Instruction &I) {
2288   return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
2289          isa<CoroSuspendInst>(&I);
2290 }
2291 
2292 // For each instruction identified as materializable across the suspend point,
2293 // and its associated DAG of other rematerializable instructions,
2294 // recreate the DAG of instructions after the suspend point.
2295 static void rewriteMaterializableInstructions(
2296     const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
2297         &AllRemats) {
2298   // This has to be done in two phases:
2299   // 1. Do the remats and record the defs that need to be replaced in the
2300   //    original use instructions.
2301   // 2. Once all the remats are complete, replace the uses in the final
2302   //    instructions with the new defs.
2303   typedef struct {
2304     Instruction *Use;
2305     Instruction *Def;
2306     Instruction *Remat;
2307   } ProcessNode;
2308 
2309   SmallVector<ProcessNode> FinalInstructionsToProcess;
2310 
2311   for (const auto &E : AllRemats) {
2312     Instruction *Use = E.first;
2313     Instruction *CurrentMaterialization = nullptr;
2314     RematGraph *RG = E.second.get();
2315     ReversePostOrderTraversal<RematGraph *> RPOT(RG);
2316     SmallVector<Instruction *> InstructionsToProcess;
2317 
2318     // If the target use is actually a suspend instruction then we have to
2319     // insert the remats into the end of the predecessor (there should only be
2320     // one). This is so that suspend blocks always have the suspend instruction
2321     // as the first instruction.
2322     auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
2323     if (isa<AnyCoroSuspendInst>(Use)) {
2324       BasicBlock *SuspendPredecessorBlock =
2325           Use->getParent()->getSinglePredecessor();
2326       assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
2327       InsertPoint = SuspendPredecessorBlock->getTerminator();
2328     }
2329 
2330     // Note: skip the first instruction as this is the actual use that we're
2331     // rematerializing everything for.
2332     auto I = RPOT.begin();
2333     ++I;
2334     for (; I != RPOT.end(); ++I) {
2335       Instruction *D = (*I)->Node;
2336       CurrentMaterialization = D->clone();
2337       CurrentMaterialization->setName(D->getName());
2338       CurrentMaterialization->insertBefore(InsertPoint);
2339       InsertPoint = CurrentMaterialization;
2340 
2341       // Replace all uses of Def in the instructions being added as part of this
2342       // rematerialization group
2343       for (auto &I : InstructionsToProcess)
2344         I->replaceUsesOfWith(D, CurrentMaterialization);
2345 
2346       // Don't replace the final use at this point as this can cause problems
2347       // for other materializations. Instead, for any final use that uses a
2348       // def that is being rematerialized, record the replacement values.
2349       for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
2350         if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
2351           FinalInstructionsToProcess.push_back(
2352               {Use, D, CurrentMaterialization});
2353 
2354       InstructionsToProcess.push_back(CurrentMaterialization);
2355     }
2356   }
2357 
2358   // Finally, replace the uses with the defs that we've just rematerialized.
2359   for (auto &R : FinalInstructionsToProcess) {
2360     if (auto *PN = dyn_cast<PHINode>(R.Use)) {
2361       assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
2362                                                 "values in the PHINode");
2363       PN->replaceAllUsesWith(R.Remat);
2364       PN->eraseFromParent();
2365       continue;
2366     }
2367     R.Use->replaceUsesOfWith(R.Def, R.Remat);
2368   }
2369 }
2370 
2371 // Splits the block at a particular instruction unless it is the first
2372 // instruction in the block with a single predecessor.
2373 static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
2374   auto *BB = I->getParent();
2375   if (&BB->front() == I) {
2376     if (BB->getSinglePredecessor()) {
2377       BB->setName(Name);
2378       return BB;
2379     }
2380   }
2381   return BB->splitBasicBlock(I, Name);
2382 }
2383 
2384 // Split above and below a particular instruction so that it
2385 // will be all alone by itself in a block.
2386 static void splitAround(Instruction *I, const Twine &Name) {
2387   splitBlockIfNotFirst(I, Name);
2388   splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
2389 }
2390 
2391 static bool isSuspendBlock(BasicBlock *BB) {
2392   return isa<AnyCoroSuspendInst>(BB->front());
2393 }
2394 
2395 typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet;
2396 
2397 /// Does control flow starting at the given block ever reach a suspend
2398 /// instruction before reaching a block in VisitedOrFreeBBs?
2399 static bool isSuspendReachableFrom(BasicBlock *From,
2400                                    VisitedBlocksSet &VisitedOrFreeBBs) {
2401   // Eagerly try to add this block to the visited set.  If it's already
2402   // there, stop recursing; this path doesn't reach a suspend before
2403   // either looping or reaching a freeing block.
2404   if (!VisitedOrFreeBBs.insert(From).second)
2405     return false;
2406 
2407   // We assume that we'll already have split suspends into their own blocks.
2408   if (isSuspendBlock(From))
2409     return true;
2410 
2411   // Recurse on the successors.
2412   for (auto *Succ : successors(From)) {
2413     if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs))
2414       return true;
2415   }
2416 
2417   return false;
2418 }
2419 
2420 /// Is the given alloca "local", i.e. bounded in lifetime to not cross a
2421 /// suspend point?
2422 static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
2423   // Seed the visited set with all the basic blocks containing a free
2424   // so that we won't pass them up.
2425   VisitedBlocksSet VisitedOrFreeBBs;
2426   for (auto *User : AI->users()) {
2427     if (auto FI = dyn_cast<CoroAllocaFreeInst>(User))
2428       VisitedOrFreeBBs.insert(FI->getParent());
2429   }
2430 
2431   return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs);
2432 }
2433 
2434 /// After we split the coroutine, will the given basic block be along
2435 /// an obvious exit path for the resumption function?
2436 static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
2437                                               unsigned depth = 3) {
2438   // If we've bottomed out our depth count, stop searching and assume
2439   // that the path might loop back.
2440   if (depth == 0) return false;
2441 
2442   // If this is a suspend block, we're about to exit the resumption function.
2443   if (isSuspendBlock(BB)) return true;
2444 
2445   // Recurse into the successors.
2446   for (auto *Succ : successors(BB)) {
2447     if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1))
2448       return false;
2449   }
2450 
2451   // If none of the successors leads back in a loop, we're on an exit/abort.
2452   return true;
2453 }
2454 
2455 static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) {
2456   // Look for a free that isn't sufficiently obviously followed by
2457   // either a suspend or a termination, i.e. something that will leave
2458   // the coro resumption frame.
2459   for (auto *U : AI->users()) {
2460     auto FI = dyn_cast<CoroAllocaFreeInst>(U);
2461     if (!FI) continue;
2462 
2463     if (!willLeaveFunctionImmediatelyAfter(FI->getParent()))
2464       return true;
2465   }
2466 
2467   // If we never found one, we don't need a stack save.
2468   return false;
2469 }
2470 
2471 /// Turn each of the given local allocas into a normal (dynamic) alloca
2472 /// instruction.
2473 static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas,
2474                               SmallVectorImpl<Instruction*> &DeadInsts) {
2475   for (auto *AI : LocalAllocas) {
2476     IRBuilder<> Builder(AI);
2477 
2478     // Save the stack depth.  Try to avoid doing this if the stackrestore
2479     // is going to immediately precede a return or something.
2480     Value *StackSave = nullptr;
2481     if (localAllocaNeedsStackSave(AI))
2482       StackSave = Builder.CreateStackSave();
2483 
2484     // Allocate memory.
2485     auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize());
2486     Alloca->setAlignment(AI->getAlignment());
2487 
2488     for (auto *U : AI->users()) {
2489       // Replace gets with the allocation.
2490       if (isa<CoroAllocaGetInst>(U)) {
2491         U->replaceAllUsesWith(Alloca);
2492 
2493       // Replace frees with stackrestores.  This is safe because
2494       // alloca.alloc is required to obey a stack discipline, although we
2495       // don't enforce that structurally.
2496       } else {
2497         auto FI = cast<CoroAllocaFreeInst>(U);
2498         if (StackSave) {
2499           Builder.SetInsertPoint(FI);
2500           Builder.CreateStackRestore(StackSave);
2501         }
2502       }
2503       DeadInsts.push_back(cast<Instruction>(U));
2504     }
2505 
2506     DeadInsts.push_back(AI);
2507   }
2508 }
2509 
2510 /// Turn the given coro.alloca.alloc call into a dynamic allocation.
2511 /// This happens during the all-instructions iteration, so it must not
2512 /// delete the call.
2513 static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI,
2514                                         coro::Shape &Shape,
2515                                    SmallVectorImpl<Instruction*> &DeadInsts) {
2516   IRBuilder<> Builder(AI);
2517   auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);
2518 
2519   for (User *U : AI->users()) {
2520     if (isa<CoroAllocaGetInst>(U)) {
2521       U->replaceAllUsesWith(Alloc);
2522     } else {
2523       auto FI = cast<CoroAllocaFreeInst>(U);
2524       Builder.SetInsertPoint(FI);
2525       Shape.emitDealloc(Builder, Alloc, nullptr);
2526     }
2527     DeadInsts.push_back(cast<Instruction>(U));
2528   }
2529 
2530   // Push this on last so that it gets deleted after all the others.
2531   DeadInsts.push_back(AI);
2532 
2533   // Return the new allocation value so that we can check for needed spills.
2534   return cast<Instruction>(Alloc);
2535 }
2536 
2537 /// Get the current swifterror value.
2538 static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
2539                                      coro::Shape &Shape) {
2540   // Make a fake function pointer as a sort of intrinsic.
2541   auto FnTy = FunctionType::get(ValueTy, {}, false);
2542   auto Fn = ConstantPointerNull::get(Builder.getPtrTy());
2543 
2544   auto Call = Builder.CreateCall(FnTy, Fn, {});
2545   Shape.SwiftErrorOps.push_back(Call);
2546 
2547   return Call;
2548 }
2549 
2550 /// Set the given value as the current swifterror value.
2551 ///
2552 /// Returns a slot that can be used as a swifterror slot.
2553 static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V,
2554                                      coro::Shape &Shape) {
2555   // Make a fake function pointer as a sort of intrinsic.
2556   auto FnTy = FunctionType::get(Builder.getPtrTy(),
2557                                 {V->getType()}, false);
2558   auto Fn = ConstantPointerNull::get(Builder.getPtrTy());
2559 
2560   auto Call = Builder.CreateCall(FnTy, Fn, { V });
2561   Shape.SwiftErrorOps.push_back(Call);
2562 
2563   return Call;
2564 }
2565 
2566 /// Set the swifterror value from the given alloca before a call,
2567 /// then put it back in the alloca afterwards.
2568 ///
2569 /// Returns an address that will stand in for the swifterror slot
2570 /// until splitting.
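     ///
     /// Illustrative sketch (hypothetical values) of the IR produced around a call
     /// that takes the swifterror slot:
     ///
     ///   %v0   = load ptr, ptr %alloca
     ///   %slot = call ptr null(ptr %v0)        ; emitSetSwiftErrorValue placeholder
     ///   call void @f(ptr swifterror %slot)
     ///   %v1   = call ptr null()               ; emitGetSwiftErrorValue placeholder
     ///   store ptr %v1, ptr %alloca
     ///
     /// The placeholder calls are recorded in Shape.SwiftErrorOps and lowered later
     /// when the coroutine is split.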
2571 static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call,
2572                                                  AllocaInst *Alloca,
2573                                                  coro::Shape &Shape) {
2574   auto ValueTy = Alloca->getAllocatedType();
2575   IRBuilder<> Builder(Call);
2576 
2577   // Load the current value from the alloca and set it as the
2578   // swifterror value.
2579   auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca);
2580   auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape);
2581 
2582   // Move to after the call.  Since swifterror only has a guaranteed
2583   // value on normal exits, we can ignore implicit and explicit unwind
2584   // edges.
2585   if (isa<CallInst>(Call)) {
2586     Builder.SetInsertPoint(Call->getNextNode());
2587   } else {
2588     auto Invoke = cast<InvokeInst>(Call);
2589     Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg());
2590   }
2591 
2592   // Get the current swifterror value and store it to the alloca.
2593   auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape);
2594   Builder.CreateStore(ValueAfterCall, Alloca);
2595 
2596   return Addr;
2597 }
2598 
2599 /// Eliminate a formerly-swifterror alloca by inserting the get/set
2600 /// intrinsics and attempting to MemToReg the alloca away.
2601 static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca,
2602                                       coro::Shape &Shape) {
2603   for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) {
2604     // swifterror values can only be used in very specific ways.
2605     // We take advantage of that here.
2606     auto User = Use.getUser();
2607     if (isa<LoadInst>(User) || isa<StoreInst>(User))
2608       continue;
2609 
2610     assert(isa<CallInst>(User) || isa<InvokeInst>(User));
2611     auto Call = cast<Instruction>(User);
2612 
2613     auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape);
2614 
2615     // Use the returned slot address as the call argument.
2616     Use.set(Addr);
2617   }
2618 
2619   // All the uses should be loads and stores now.
2620   assert(isAllocaPromotable(Alloca));
2621 }
2622 
2623 /// "Eliminate" a swifterror argument by reducing it to the alloca case
2624 /// and then loading and storing in the prologue and epilog.
2625 ///
2626 /// The argument keeps the swifterror flag.
2627 static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
2628                                         coro::Shape &Shape,
2629                              SmallVectorImpl<AllocaInst*> &AllocasToPromote) {
2630   IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());
2631 
2632   auto ArgTy = cast<PointerType>(Arg.getType());
2633   auto ValueTy = PointerType::getUnqual(F.getContext());
2634 
2635   // Reduce to the alloca case:
2636 
2637   // Create an alloca and replace all uses of the arg with it.
2638   auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace());
2639   Arg.replaceAllUsesWith(Alloca);
2640 
2641   // Set an initial value in the alloca.  swifterror is always null on entry.
2642   auto InitialValue = Constant::getNullValue(ValueTy);
2643   Builder.CreateStore(InitialValue, Alloca);
2644 
2645   // Find all the suspends in the function and save and restore around them.
2646   for (auto *Suspend : Shape.CoroSuspends) {
2647     (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape);
2648   }
2649 
2650   // Find all the coro.ends in the function and restore the error value.
2651   for (auto *End : Shape.CoroEnds) {
2652     Builder.SetInsertPoint(End);
2653     auto FinalValue = Builder.CreateLoad(ValueTy, Alloca);
2654     (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape);
2655   }
2656 
2657   // Now we can use the alloca logic.
2658   AllocasToPromote.push_back(Alloca);
2659   eliminateSwiftErrorAlloca(F, Alloca, Shape);
2660 }
2661 
2662 /// Eliminate all problematic uses of swifterror arguments and allocas
2663 /// from the function.  We'll fix them up later when splitting the function.
2664 static void eliminateSwiftError(Function &F, coro::Shape &Shape) {
2665   SmallVector<AllocaInst*, 4> AllocasToPromote;
2666 
2667   // Look for a swifterror argument.
2668   for (auto &Arg : F.args()) {
2669     if (!Arg.hasSwiftErrorAttr()) continue;
2670 
2671     eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote);
2672     break;
2673   }
2674 
2675   // Look for swifterror allocas.
2676   for (auto &Inst : F.getEntryBlock()) {
2677     auto Alloca = dyn_cast<AllocaInst>(&Inst);
2678     if (!Alloca || !Alloca->isSwiftError()) continue;
2679 
2680     // Clear the swifterror flag.
2681     Alloca->setSwiftError(false);
2682 
2683     AllocasToPromote.push_back(Alloca);
2684     eliminateSwiftErrorAlloca(F, Alloca, Shape);
2685   }
2686 
2687   // If we have any allocas to promote, compute a dominator tree and
2688   // promote them en masse.
2689   if (!AllocasToPromote.empty()) {
2690     DominatorTree DT(F);
2691     PromoteMemToReg(AllocasToPromote, DT);
2692   }
2693 }
2694 
2695 /// retcon and retcon.once conventions assume that all spill uses can be sunk
2696 /// after the coro.begin intrinsic.
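     ///
     /// For example (an illustrative sketch, names made up): given
     ///   %use = add i32 %spilled, 1
     ///   %hdl = call ptr @llvm.coro.begin(token %id, ptr %mem)
     /// where %spilled ends up on the frame, %use is moved to just after the
     /// coro.begin. Users are collected transitively and re-inserted in
     /// dominance order so that defs still precede their uses.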
2697 static void sinkSpillUsesAfterCoroBegin(Function &F,
2698                                         const FrameDataInfo &FrameData,
2699                                         CoroBeginInst *CoroBegin) {
2700   DominatorTree Dom(F);
2701 
2702   SmallSetVector<Instruction *, 32> ToMove;
2703   SmallVector<Instruction *, 32> Worklist;
2704 
2705   // Collect all users that precede coro.begin.
2706   for (auto *Def : FrameData.getAllDefs()) {
2707     for (User *U : Def->users()) {
2708       auto Inst = cast<Instruction>(U);
2709       if (Inst->getParent() != CoroBegin->getParent() ||
2710           Dom.dominates(CoroBegin, Inst))
2711         continue;
2712       if (ToMove.insert(Inst))
2713         Worklist.push_back(Inst);
2714     }
2715   }
2716   // Recursively collect users before coro.begin.
2717   while (!Worklist.empty()) {
2718     auto *Def = Worklist.pop_back_val();
2719     for (User *U : Def->users()) {
2720       auto Inst = cast<Instruction>(U);
2721       if (Dom.dominates(CoroBegin, Inst))
2722         continue;
2723       if (ToMove.insert(Inst))
2724         Worklist.push_back(Inst);
2725     }
2726   }
2727 
2728   // Sort by dominance.
2729   SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end());
2730   llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool {
2731     // If A dominates B, it should precede (<) B.
2732     return Dom.dominates(A, B);
2733   });
2734 
2735   Instruction *InsertPt = CoroBegin->getNextNode();
2736   for (Instruction *Inst : InsertionList)
2737     Inst->moveBefore(InsertPt);
2738 }
2739 
2740 /// For each local variable whose uses all lie within a single suspended
2741 /// region, sink its lifetime.start marker into the block that follows the
2742 /// suspend. Doing so minimizes the lifetime of each variable and hence
2743 /// minimizes the amount of data we end up putting on the frame.
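     ///
     /// Illustrative sketch (block and value names made up): if the only
     /// lifetime.start for %x is in the entry block but every other use of %x
     /// sits after a suspend,
     ///   entry:
     ///     call void @llvm.lifetime.start.p0(i64 8, ptr %x)
     ///     ...
     ///   await.ready:                       ; successor of the suspend block
     ///     ... uses of %x ...
     /// a clone of the lifetime.start is inserted before the terminator of
     /// await.ready and the original markers are erased, so %x is no longer
     /// considered live across the suspend.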
2744 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
2745                                      SuspendCrossingInfo &Checker,
2746                                      const DominatorTree &DT) {
2747   if (F.hasOptNone())
2748     return;
2749 
2750   // Collect all possible basic blocks which may dominate all uses of allocas.
2751   SmallPtrSet<BasicBlock *, 4> DomSet;
2752   DomSet.insert(&F.getEntryBlock());
2753   for (auto *CSI : Shape.CoroSuspends) {
2754     BasicBlock *SuspendBlock = CSI->getParent();
2755     assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() &&
2756            "should have split coro.suspend into its own block");
2757     DomSet.insert(SuspendBlock->getSingleSuccessor());
2758   }
2759 
2760   for (Instruction &I : instructions(F)) {
2761     AllocaInst* AI = dyn_cast<AllocaInst>(&I);
2762     if (!AI)
2763       continue;
2764 
2765     for (BasicBlock *DomBB : DomSet) {
2766       bool Valid = true;
2767       SmallVector<Instruction *, 1> Lifetimes;
2768 
2769       auto isLifetimeStart = [](Instruction* I) {
2770         if (auto* II = dyn_cast<IntrinsicInst>(I))
2771           return II->getIntrinsicID() == Intrinsic::lifetime_start;
2772         return false;
2773       };
2774 
2775       auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) {
2776         if (isLifetimeStart(U)) {
2777           Lifetimes.push_back(U);
2778           return true;
2779         }
2780         if (!U->hasOneUse() || U->stripPointerCasts() != AI)
2781           return false;
2782         if (isLifetimeStart(U->user_back())) {
2783           Lifetimes.push_back(U->user_back());
2784           return true;
2785         }
2786         return false;
2787       };
2788 
2789       for (User *U : AI->users()) {
2790         Instruction *UI = cast<Instruction>(U);
2791         // For all users except lifetime.start markers: if every one is
2792         // dominated by one of the candidate blocks and none crosses a
2793         // suspend point, then there is no need to spill the
2794         // instruction.
2795         if (!DT.dominates(DomBB, UI->getParent()) ||
2796             Checker.isDefinitionAcrossSuspend(DomBB, UI)) {
2797           // Skip lifetime.start, GEP and bitcast used by lifetime.start
2798           // markers.
2799           if (collectLifetimeStart(UI, AI))
2800             continue;
2801           Valid = false;
2802           break;
2803         }
2804       }
2805       // Sink the lifetime.start markers into the dominating block when they
2806       // are the only uses outside that region.
2807       if (Valid && Lifetimes.size() != 0) {
2808         auto *NewLifetime = Lifetimes[0]->clone();
2809         NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI);
2810         NewLifetime->insertBefore(DomBB->getTerminator());
2811 
2812         // The original lifetime.start markers are no longer necessary.
2813         for (Instruction *S : Lifetimes)
2814           S->eraseFromParent();
2815 
2816         break;
2817       }
2818     }
2819   }
2820 }
2821 
2822 static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
2823                                const SuspendCrossingInfo &Checker,
2824                                SmallVectorImpl<AllocaInfo> &Allocas,
2825                                const DominatorTree &DT) {
2826   if (Shape.CoroSuspends.empty())
2827     return;
2828 
2829   // The PromiseAlloca will be specially handled since it needs to be in a
2830   // fixed position in the frame.
2831   if (AI == Shape.SwitchLowering.PromiseAlloca)
2832     return;
2833 
2834   // The __coro_gro alloca should outlive the promise, so make sure we
2835   // keep it outside the frame.
2836   if (AI->hasMetadata(LLVMContext::MD_coro_outside_frame))
2837     return;
2838 
2839   // The code that uses the lifetime.start intrinsic does not work for
2840   // functions with loops that never exit. Disable it on ABIs that we know
2841   // can generate such code.
2842   bool ShouldUseLifetimeStartInfo =
2843       (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
2844        Shape.ABI != coro::ABI::RetconOnce);
2845   AllocaUseVisitor Visitor{AI->getDataLayout(), DT, Shape, Checker,
2846                            ShouldUseLifetimeStartInfo};
2847   Visitor.visitPtr(*AI);
2848   if (!Visitor.getShouldLiveOnFrame())
2849     return;
2850   Allocas.emplace_back(AI, Visitor.getAliasesCopy(),
2851                        Visitor.getMayWriteBeforeCoroBegin());
2852 }
2853 
2854 static std::optional<std::pair<Value &, DIExpression &>>
2855 salvageDebugInfoImpl(SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
2856                      bool OptimizeFrame, bool UseEntryValue, Function *F,
2857                      Value *Storage, DIExpression *Expr,
2858                      bool SkipOutermostLoad) {
2859   IRBuilder<> Builder(F->getContext());
2860   auto InsertPt = F->getEntryBlock().getFirstInsertionPt();
2861   while (isa<IntrinsicInst>(InsertPt))
2862     ++InsertPt;
2863   Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt);
2864 
2865   while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) {
2866     if (auto *LdInst = dyn_cast<LoadInst>(Inst)) {
2867       Storage = LdInst->getPointerOperand();
2868       // FIXME: This is a heuristic that works around the fact that
2869       // LLVM IR debug intrinsics cannot yet distinguish between
2870       // memory and value locations: Because a dbg.declare(alloca) is
2871       // implicitly a memory location no DW_OP_deref operation for the
2872       // last direct load from an alloca is necessary.  This condition
2873       // effectively drops the *last* DW_OP_deref in the expression.
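           // (For illustration: with dbg.declare, SkipOutermostLoad starts out
           // true, so the first load walked here does not add a deref; each
           // deeper load does prepend a DW_OP_deref.)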
2874       if (!SkipOutermostLoad)
2875         Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
2876     } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) {
2877       Storage = StInst->getValueOperand();
2878     } else {
2879       SmallVector<uint64_t, 16> Ops;
2880       SmallVector<Value *, 0> AdditionalValues;
2881       Value *Op = llvm::salvageDebugInfoImpl(
2882           *Inst, Expr ? Expr->getNumLocationOperands() : 0, Ops,
2883           AdditionalValues);
2884       if (!Op || !AdditionalValues.empty()) {
2885         // If salvaging failed or salvaging produced more than one location
2886         // operand, give up.
2887         break;
2888       }
2889       Storage = Op;
2890       Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false);
2891     }
2892     SkipOutermostLoad = false;
2893   }
2894   if (!Storage)
2895     return std::nullopt;
2896 
2897   auto *StorageAsArg = dyn_cast<Argument>(Storage);
2898   const bool IsSwiftAsyncArg =
2899       StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync);
2900 
2901   // Swift async arguments are described by an entry value of the ABI-defined
2902   // register containing the coroutine context.
2903   // Entry values in variadic expressions are not supported.
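       // (Illustratively: DIExpression::EntryValue prepends an entry-value
       // operation, so the location is interpreted as the value the context
       // register held on entry to the function; this is an assumption about
       // the usual DWARF lowering rather than something enforced here.)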
2904   if (IsSwiftAsyncArg && UseEntryValue && !Expr->isEntryValue() &&
2905       Expr->isSingleLocationExpression())
2906     Expr = DIExpression::prepend(Expr, DIExpression::EntryValue);
2907 
2908   // If the coroutine frame is an Argument, store it in an alloca to improve
2909   // its availability (e.g. registers may be clobbered).
2910   // Avoid this if optimizations are enabled (they would remove the alloca) or
2911   // if the value is guaranteed to be available through other means (e.g. swift
2912   // ABI guarantees).
2913   if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) {
2914     auto &Cached = ArgToAllocaMap[StorageAsArg];
2915     if (!Cached) {
2916       Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
2917                                     Storage->getName() + ".debug");
2918       Builder.CreateStore(Storage, Cached);
2919     }
2920     Storage = Cached;
2921     // FIXME: LLVM lacks nuanced semantics to differentiate between
2922     // memory and direct locations at the IR level. The backend will
2923     // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
2924     // location. Thus, if there are deref and offset operations in the
2925     // expression, we need to add a DW_OP_deref at the *start* of the
2926     // expression to first load the contents of the alloca before
2927     // adjusting it with the expression.
2928     Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
2929   }
2930 
2931   return {{*Storage, *Expr}};
2932 }
2933 
2934 void coro::salvageDebugInfo(
2935     SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
2936     DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool UseEntryValue) {
2937 
2938   Function *F = DVI.getFunction();
2939   // Follow the pointer arithmetic all the way to the incoming
2940   // function argument and convert into a DIExpression.
2941   bool SkipOutermostLoad = !isa<DbgValueInst>(DVI);
2942   Value *OriginalStorage = DVI.getVariableLocationOp(0);
2943 
2944   auto SalvagedInfo = ::salvageDebugInfoImpl(
2945       ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
2946       DVI.getExpression(), SkipOutermostLoad);
2947   if (!SalvagedInfo)
2948     return;
2949 
2950   Value *Storage = &SalvagedInfo->first;
2951   DIExpression *Expr = &SalvagedInfo->second;
2952 
2953   DVI.replaceVariableLocationOp(OriginalStorage, Storage);
2954   DVI.setExpression(Expr);
2955   // We only hoist dbg.declare today; it doesn't make sense to hoist
2956   // dbg.value because it lacks the function-wide guarantees that
2957   // dbg.declare provides.
2958   if (isa<DbgDeclareInst>(DVI)) {
2959     std::optional<BasicBlock::iterator> InsertPt;
2960     if (auto *I = dyn_cast<Instruction>(Storage)) {
2961       InsertPt = I->getInsertionPointAfterDef();
2962       // Update DILocation only if variable was not inlined.
2963       DebugLoc ILoc = I->getDebugLoc();
2964       DebugLoc DVILoc = DVI.getDebugLoc();
2965       if (ILoc && DVILoc &&
2966           DVILoc->getScope()->getSubprogram() ==
2967               ILoc->getScope()->getSubprogram())
2968         DVI.setDebugLoc(I->getDebugLoc());
2969     } else if (isa<Argument>(Storage))
2970       InsertPt = F->getEntryBlock().begin();
2971     if (InsertPt)
2972       DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt);
2973   }
2974 }
2975 
2976 void coro::salvageDebugInfo(
2977     SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
2978     DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue) {
2979 
2980   Function *F = DVR.getFunction();
2981   // Follow the pointer arithmetic all the way to the incoming
2982   // function argument and convert into a DIExpression.
2983   bool SkipOutermostLoad = DVR.isDbgDeclare();
2984   Value *OriginalStorage = DVR.getVariableLocationOp(0);
2985 
2986   auto SalvagedInfo = ::salvageDebugInfoImpl(
2987       ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
2988       DVR.getExpression(), SkipOutermostLoad);
2989   if (!SalvagedInfo)
2990     return;
2991 
2992   Value *Storage = &SalvagedInfo->first;
2993   DIExpression *Expr = &SalvagedInfo->second;
2994 
2995   DVR.replaceVariableLocationOp(OriginalStorage, Storage);
2996   DVR.setExpression(Expr);
2997   // We only hoist dbg.declare today; it doesn't make sense to hoist
2998   // dbg.value because it lacks the function-wide guarantees that
2999   // dbg.declare provides.
3000   if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
3001     std::optional<BasicBlock::iterator> InsertPt;
3002     if (auto *I = dyn_cast<Instruction>(Storage)) {
3003       InsertPt = I->getInsertionPointAfterDef();
3004       // Update DILocation only if variable was not inlined.
3005       DebugLoc ILoc = I->getDebugLoc();
3006       DebugLoc DVRLoc = DVR.getDebugLoc();
3007       if (ILoc && DVRLoc &&
3008           DVRLoc->getScope()->getSubprogram() ==
3009               ILoc->getScope()->getSubprogram())
3010         DVR.setDebugLoc(ILoc);
3011     } else if (isa<Argument>(Storage))
3012       InsertPt = F->getEntryBlock().begin();
3013     if (InsertPt) {
3014       DVR.removeFromParent();
3015       (*InsertPt)->getParent()->insertDbgRecordBefore(&DVR, *InsertPt);
3016     }
3017   }
3018 }
3019 
3020 static void doRematerializations(
3021     Function &F, SuspendCrossingInfo &Checker,
3022     const std::function<bool(Instruction &)> &MaterializableCallback) {
3023   if (F.hasOptNone())
3024     return;
3025 
3026   SpillInfo Spills;
3027 
3028   // See if there are materializable instructions across suspend points.
3029   // We record these as the starting points from which to also identify
3030   // materializable defs feeding into these operations.
3031   for (Instruction &I : instructions(F)) {
3032     if (!MaterializableCallback(I))
3033       continue;
3034     for (User *U : I.users())
3035       if (Checker.isDefinitionAcrossSuspend(I, U))
3036         Spills[&I].push_back(cast<Instruction>(U));
3037   }
3038 
3039   // Process each of the identified rematerializable instructions
3040   // and add predecessor instructions that can also be rematerialized.
3041   // This is actually a graph of instructions, since we could potentially
3042   // have multiple uses of a def in the set of predecessor instructions.
3043   // The approach here is to maintain a graph of instructions for each
3044   // bottom-level instruction - where we have a unique set of instructions
3045   // (nodes) and edges between them. We then walk the graph in reverse
3046   // post-order to insert the instructions past the suspend point while
3047   // keeping the ordering correct. We also rely on CSE to remove duplicate
3048   // defs for remats of different instructions that share a def (rather
3049   // than maintaining more complex graphs for each suspend point).
3050 
3051   // We can do this by adding new nodes to the list for each suspend
3052   // point, then using standard GraphTraits to produce a reverse post-order
3053   // traversal when we insert the nodes after the suspend.
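       // Illustrative example (a sketch, not from a real test): with
       //   %a = add i32 %x, 1
       //   %b = mul i32 %a, 2   ; both rematerializable, %b used across a suspend
       // the RematGraph rooted at the user of %b contains both %a and %b, and
       // the reverse post-order walk re-creates %a before %b at the use point
       // instead of spilling %b to the frame.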
3054   SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
3055   for (auto &E : Spills) {
3056     for (Instruction *U : E.second) {
3057       // Don't process a user twice (this can happen if the instruction uses
3058       // more than one rematerializable def)
3059       if (AllRemats.count(U))
3060         continue;
3061 
3062       // Constructor creates the whole RematGraph for the given Use
3063       auto RematUPtr =
3064           std::make_unique<RematGraph>(MaterializableCallback, U, Checker);
3065 
3066       LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
3067                  ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
3068                  for (auto I = RPOT.begin(); I != RPOT.end();
3069                       ++I) { (*I)->Node->dump(); } dbgs()
3070                  << "\n";);
3071 
3072       AllRemats[U] = std::move(RematUPtr);
3073     }
3074   }
3075 
3076   // Rewrite materializable instructions to be materialized at the use
3077   // point.
3078   LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
3079   rewriteMaterializableInstructions(AllRemats);
3080 }
3081 
3082 void coro::buildCoroutineFrame(
3083     Function &F, Shape &Shape, TargetTransformInfo &TTI,
3084     const std::function<bool(Instruction &)> &MaterializableCallback) {
3085   // Don't eliminate swifterror in async functions that won't be split.
3086   if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
3087     eliminateSwiftError(F, Shape);
3088 
3089   if (Shape.ABI == coro::ABI::Switch &&
3090       Shape.SwitchLowering.PromiseAlloca) {
3091     Shape.getSwitchCoroId()->clearPromise();
3092   }
3093 
3094   // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
3095   // intrinsics are in their own blocks to simplify the logic of building up
3096   // SuspendCrossing data.
3097   for (auto *CSI : Shape.CoroSuspends) {
3098     if (auto *Save = CSI->getCoroSave())
3099       splitAround(Save, "CoroSave");
3100     splitAround(CSI, "CoroSuspend");
3101   }
3102 
3103   // Put CoroEnds into their own blocks.
3104   for (AnyCoroEndInst *CE : Shape.CoroEnds) {
3105     splitAround(CE, "CoroEnd");
3106 
3107     // Emit the musttail call function in a new block before the CoroEnd.
3108     // We do this here so that the right suspend crossing info is computed for
3109     // the uses feeding the musttail call. (Arguments to the coro.end
3110     // instructions themselves would be ignored.)
3111     if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) {
3112       auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction();
3113       if (!MustTailCallFn)
3114         continue;
3115       IRBuilder<> Builder(AsyncEnd);
3116       SmallVector<Value *, 8> Args(AsyncEnd->args());
3117       auto Arguments = ArrayRef<Value *>(Args).drop_front(3);
3118       auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
3119                                       TTI, Arguments, Builder);
3120       splitAround(Call, "MustTailCall.Before.CoroEnd");
3121     }
3122   }
3123 
3124   // Later code makes structural assumptions about single-predecessor PHIs,
3125   // e.g. that they are not live across a suspend point.
3126   cleanupSinglePredPHIs(F);
3127 
3128   // Transform multi-edge PHI nodes so that any value feeding into a PHI never
3129   // has its definition separated from the PHI by a suspend point.
3130   rewritePHIs(F);
3131 
3132   // Build suspend crossing info.
3133   SuspendCrossingInfo Checker(F, Shape);
3134 
3135   doRematerializations(F, Checker, MaterializableCallback);
3136 
3137   const DominatorTree DT(F);
3138   FrameDataInfo FrameData;
3139   SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas;
3140   SmallVector<Instruction*, 4> DeadInstructions;
3141   if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
3142       Shape.ABI != coro::ABI::RetconOnce)
3143     sinkLifetimeStartMarkers(F, Shape, Checker, DT);
3144 
3145   // Collect the spills for arguments and other non-materializable values.
3146   for (Argument &A : F.args())
3147     for (User *U : A.users())
3148       if (Checker.isDefinitionAcrossSuspend(A, U))
3149         FrameData.Spills[&A].push_back(cast<Instruction>(U));
3150 
3151   for (Instruction &I : instructions(F)) {
3152     // Values returned from coroutine structure intrinsics should not be part
3153     // of the Coroutine Frame.
3154     if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin)
3155       continue;
3156 
3157     // Handle alloca.alloc specially here.
3158     if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) {
3159       // Check whether the alloca's lifetime is bounded by suspend points.
3160       if (isLocalAlloca(AI)) {
3161         LocalAllocas.push_back(AI);
3162         continue;
3163       }
3164 
3165       // If not, do a quick rewrite of the alloca and then add spills of
3166       // the rewritten value.  The rewrite doesn't invalidate anything in
3167       // Spills because the other alloca intrinsics have no other operands
3168       // besides AI, and it doesn't invalidate the iteration because we delay
3169       // erasing AI.
3170       auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions);
3171 
3172       for (User *U : Alloc->users()) {
3173         if (Checker.isDefinitionAcrossSuspend(*Alloc, U))
3174           FrameData.Spills[Alloc].push_back(cast<Instruction>(U));
3175       }
3176       continue;
3177     }
3178 
3179     // Ignore alloca.get; we process this as part of coro.alloca.alloc.
3180     if (isa<CoroAllocaGetInst>(I))
3181       continue;
3182 
3183     if (auto *AI = dyn_cast<AllocaInst>(&I)) {
3184       collectFrameAlloca(AI, Shape, Checker, FrameData.Allocas, DT);
3185       continue;
3186     }
3187 
3188     for (User *U : I.users())
3189       if (Checker.isDefinitionAcrossSuspend(I, U)) {
3190         // We cannot spill a token.
3191         if (I.getType()->isTokenTy())
3192           report_fatal_error(
3193               "token definition is separated from the use by a suspend point");
3194         FrameData.Spills[&I].push_back(cast<Instruction>(U));
3195       }
3196   }
3197 
3198   LLVM_DEBUG(dumpAllocas(FrameData.Allocas));
3199 
3200   // We don't want the layout of the coroutine frame to be affected
3201   // by debug information, so we only salvage DbgValueInsts whose values
3202   // are already in the frame.
3203   // The dbg.values for allocas are handled separately.
3204   for (auto &Iter : FrameData.Spills) {
3205     auto *V = Iter.first;
3206     SmallVector<DbgValueInst *, 16> DVIs;
3207     SmallVector<DbgVariableRecord *, 16> DVRs;
3208     findDbgValues(DVIs, V, &DVRs);
3209     for (DbgValueInst *DVI : DVIs)
3210       if (Checker.isDefinitionAcrossSuspend(*V, DVI))
3211         FrameData.Spills[V].push_back(DVI);
3212     // Add the instructions which carry debug info that is in the frame.
3213     for (DbgVariableRecord *DVR : DVRs)
3214       if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
3215         FrameData.Spills[V].push_back(DVR->Marker->MarkedInstr);
3216   }
3217 
3218   LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
3219   if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
3220       Shape.ABI == coro::ABI::Async)
3221     sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin);
3222   Shape.FrameTy = buildFrameType(F, Shape, FrameData);
3223   Shape.FramePtr = Shape.CoroBegin;
3224   // For now, this works for C++ programs only.
3225   buildFrameDebugInfo(F, Shape, FrameData);
3226   insertSpills(FrameData, Shape);
3227   lowerLocalAllocas(LocalAllocas, DeadInstructions);
3228 
3229   for (auto *I : DeadInstructions)
3230     I->eraseFromParent();
3231 }
3232