xref: /llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp (revision 34b139594aa20fe712bc2ad68544632b3e4d8512)
1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This transformation implements the well known scalar replacement of
10 /// aggregates transformation. It tries to identify promotable elements of an
11 /// aggregate alloca, and promote them to registers. It will also try to
12 /// convert uses of an element (or set of elements) of an alloca into a vector
13 /// or bitfield-style integer scalar if appropriate.
14 ///
15 /// It works to do this with minimal slicing of the alloca so that regions
16 /// which are merely transferred in and out of external memory remain unchanged
17 /// and are not decomposed to scalar code.
18 ///
19 /// Because this also performs alloca promotion, it can be thought of as also
20 /// serving the purpose of SSA formation. The algorithm iterates on the
21 /// function until all opportunities for promotion have been realized.
22 ///
23 //===----------------------------------------------------------------------===//
24 
25 #include "llvm/Transforms/Scalar/SROA.h"
26 #include "llvm/ADT/APInt.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/DenseMap.h"
29 #include "llvm/ADT/MapVector.h"
30 #include "llvm/ADT/PointerIntPair.h"
31 #include "llvm/ADT/STLExtras.h"
32 #include "llvm/ADT/SetVector.h"
33 #include "llvm/ADT/SmallBitVector.h"
34 #include "llvm/ADT/SmallPtrSet.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/ADT/StringRef.h"
38 #include "llvm/ADT/Twine.h"
39 #include "llvm/ADT/iterator.h"
40 #include "llvm/ADT/iterator_range.h"
41 #include "llvm/Analysis/AssumptionCache.h"
42 #include "llvm/Analysis/DomTreeUpdater.h"
43 #include "llvm/Analysis/GlobalsModRef.h"
44 #include "llvm/Analysis/Loads.h"
45 #include "llvm/Analysis/PtrUseVisitor.h"
46 #include "llvm/Analysis/ValueTracking.h"
47 #include "llvm/Config/llvm-config.h"
48 #include "llvm/IR/BasicBlock.h"
49 #include "llvm/IR/Constant.h"
50 #include "llvm/IR/ConstantFolder.h"
51 #include "llvm/IR/Constants.h"
52 #include "llvm/IR/DIBuilder.h"
53 #include "llvm/IR/DataLayout.h"
54 #include "llvm/IR/DebugInfo.h"
55 #include "llvm/IR/DebugInfoMetadata.h"
56 #include "llvm/IR/DerivedTypes.h"
57 #include "llvm/IR/Dominators.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/GlobalAlias.h"
60 #include "llvm/IR/IRBuilder.h"
61 #include "llvm/IR/InstVisitor.h"
62 #include "llvm/IR/Instruction.h"
63 #include "llvm/IR/Instructions.h"
64 #include "llvm/IR/IntrinsicInst.h"
65 #include "llvm/IR/LLVMContext.h"
66 #include "llvm/IR/Metadata.h"
67 #include "llvm/IR/Module.h"
68 #include "llvm/IR/Operator.h"
69 #include "llvm/IR/PassManager.h"
70 #include "llvm/IR/Type.h"
71 #include "llvm/IR/Use.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/IR/ValueHandle.h"
75 #include "llvm/InitializePasses.h"
76 #include "llvm/Pass.h"
77 #include "llvm/Support/Casting.h"
78 #include "llvm/Support/CommandLine.h"
79 #include "llvm/Support/Compiler.h"
80 #include "llvm/Support/Debug.h"
81 #include "llvm/Support/ErrorHandling.h"
82 #include "llvm/Support/raw_ostream.h"
83 #include "llvm/Transforms/Scalar.h"
84 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
85 #include "llvm/Transforms/Utils/Local.h"
86 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
87 #include "llvm/Transforms/Utils/SSAUpdater.h"
88 #include <algorithm>
89 #include <cassert>
90 #include <cstddef>
91 #include <cstdint>
92 #include <cstring>
93 #include <iterator>
94 #include <string>
95 #include <tuple>
96 #include <utility>
97 #include <variant>
98 #include <vector>
99 
100 using namespace llvm;
101 
102 #define DEBUG_TYPE "sroa"
103 
104 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
105 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
106 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
107 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
108 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
109 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
110 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
111 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
112 STATISTIC(NumLoadsPredicated,
113           "Number of loads rewritten into predicated loads to allow promotion");
114 STATISTIC(
115     NumStoresPredicated,
116     "Number of stores rewritten into predicated loads to allow promotion");
117 STATISTIC(NumDeleted, "Number of instructions deleted");
118 STATISTIC(NumVectorized, "Number of vectorized aggregates");
119 
120 /// Disable running mem2reg during SROA in order to test or debug SROA.
121 static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
122                                      cl::Hidden);
123 namespace {
124 
125 class AllocaSliceRewriter;
126 class AllocaSlices;
127 class Partition;
128 
129 class SelectHandSpeculativity {
130   unsigned char Storage = 0; // None are speculatable by default.
131   using TrueVal = Bitfield::Element<bool, 0, 1>;  // Low 0'th bit.
132   using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
133 public:
134   SelectHandSpeculativity() = default;
135   SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
136   bool isSpeculatable(bool isTrueVal) const;
137   bool areAllSpeculatable() const;
138   bool areAnySpeculatable() const;
139   bool areNoneSpeculatable() const;
140   // For interop as int half of PointerIntPair.
141   explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
142   explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
143 };
144 static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
145 
146 using PossiblySpeculatableLoad =
147     PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
148 using UnspeculatableStore = StoreInst *;
149 using RewriteableMemOp =
150     std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
151 using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
152 
153 /// An optimization pass providing Scalar Replacement of Aggregates.
154 ///
155 /// This pass takes allocations which can be completely analyzed (that is, they
156 /// don't escape) and tries to turn them into scalar SSA values. There are
157 /// a few steps to this process.
158 ///
159 /// 1) It takes allocations of aggregates and analyzes the ways in which they
160 ///    are used to try to split them into smaller allocations, ideally of
161 ///    a single scalar data type. It will split up memcpy and memset accesses
162 ///    as necessary and try to isolate individual scalar accesses.
163 /// 2) It will transform accesses into forms which are suitable for SSA value
164 ///    promotion. This can be replacing a memset with a scalar store of an
165 ///    integer value, or it can involve speculating operations on a PHI or
166 ///    select to be a PHI or select of the results.
167 /// 3) Finally, this will try to detect a pattern of accesses which map cleanly
168 ///    onto insert and extract operations on a vector value, and convert them to
169 ///    this form. By doing so, it will enable promotion of vector aggregates to
170 ///    SSA vector values.
171 class SROA {
172   LLVMContext *const C;
173   DomTreeUpdater *const DTU;
174   AssumptionCache *const AC;
175   const bool PreserveCFG;
176 
177   /// Worklist of alloca instructions to simplify.
178   ///
179   /// Each alloca in the function is added to this. Each new alloca formed gets
180   /// added to it as well to recursively simplify unless that alloca can be
181   /// directly promoted. Finally, each time we rewrite a use of an alloca other
182   /// the one being actively rewritten, we add it back onto the list if not
183   /// already present to ensure it is re-visited.
184   SmallSetVector<AllocaInst *, 16> Worklist;
185 
186   /// A collection of instructions to delete.
187   /// We try to batch deletions to simplify code and make things a bit more
188   /// efficient. We also make sure there is no dangling pointers.
189   SmallVector<WeakVH, 8> DeadInsts;
190 
191   /// Post-promotion worklist.
192   ///
193   /// Sometimes we discover an alloca which has a high probability of becoming
194   /// viable for SROA after a round of promotion takes place. In those cases,
195   /// the alloca is enqueued here for re-processing.
196   ///
197   /// Note that we have to be very careful to clear allocas out of this list in
198   /// the event they are deleted.
199   SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
200 
201   /// A collection of alloca instructions we can directly promote.
202   SetVector<AllocaInst *, SmallVector<AllocaInst *>,
203             SmallPtrSet<AllocaInst *, 16>, 16>
204       PromotableAllocas;
205 
206   /// A worklist of PHIs to speculate prior to promoting allocas.
207   ///
208   /// All of these PHIs have been checked for the safety of speculation and by
209   /// being speculated will allow promoting allocas currently in the promotable
210   /// queue.
211   SmallSetVector<PHINode *, 8> SpeculatablePHIs;
212 
213   /// A worklist of select instructions to rewrite prior to promoting
214   /// allocas.
215   SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
216 
217   /// Select instructions that use an alloca and are subsequently loaded can be
218   /// rewritten to load both input pointers and then select between the result,
219   /// allowing the load of the alloca to be promoted.
220   /// From this:
221   ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
222   ///   %V = load <type>, ptr %P2
223   /// to:
224   ///   %V1 = load <type>, ptr %Alloca      -> will be mem2reg'd
225   ///   %V2 = load <type>, ptr %Other
226   ///   %V = select i1 %cond, <type> %V1, <type> %V2
227   ///
228   /// We can do this to a select if its only uses are loads
229   /// and if either the operand to the select can be loaded unconditionally,
230   ///        or if we are allowed to perform CFG modifications.
231   /// If found an intervening bitcast with a single use of the load,
232   /// allow the promotion.
233   static std::optional<RewriteableMemOps>
234   isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
235 
236 public:
237   SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
238        SROAOptions PreserveCFG_)
239       : C(C), DTU(DTU), AC(AC),
240         PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
241 
242   /// Main run method used by both the SROAPass and by the legacy pass.
243   std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
244 
245 private:
246   friend class AllocaSliceRewriter;
247 
248   bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
249   AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
250   bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
251   bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
252   std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
253   void clobberUse(Use &U);
254   bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
255   bool promoteAllocas(Function &F);
256 };
257 
258 } // end anonymous namespace
259 
260 /// Calculate the fragment of a variable to use when slicing a store
261 /// based on the slice dimensions, existing fragment, and base storage
262 /// fragment.
263 /// Results:
264 /// UseFrag - Use Target as the new fragment.
265 /// UseNoFrag - The new slice already covers the whole variable.
266 /// Skip - The new alloca slice doesn't include this variable.
267 /// FIXME: Can we use calculateFragmentIntersect instead?
268 namespace {
269 enum FragCalcResult { UseFrag, UseNoFrag, Skip };
270 }
271 static FragCalcResult
272 calculateFragment(DILocalVariable *Variable,
273                   uint64_t NewStorageSliceOffsetInBits,
274                   uint64_t NewStorageSliceSizeInBits,
275                   std::optional<DIExpression::FragmentInfo> StorageFragment,
276                   std::optional<DIExpression::FragmentInfo> CurrentFragment,
277                   DIExpression::FragmentInfo &Target) {
278   // If the base storage describes part of the variable apply the offset and
279   // the size constraint.
280   if (StorageFragment) {
281     Target.SizeInBits =
282         std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
283     Target.OffsetInBits =
284         NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
285   } else {
286     Target.SizeInBits = NewStorageSliceSizeInBits;
287     Target.OffsetInBits = NewStorageSliceOffsetInBits;
288   }
289 
290   // If this slice extracts the entirety of an independent variable from a
291   // larger alloca, do not produce a fragment expression, as the variable is
292   // not fragmented.
293   if (!CurrentFragment) {
294     if (auto Size = Variable->getSizeInBits()) {
295       // Treat the current fragment as covering the whole variable.
296       CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
297       if (Target == CurrentFragment)
298         return UseNoFrag;
299     }
300   }
301 
302   // No additional work to do if there isn't a fragment already, or there is
303   // but it already exactly describes the new assignment.
304   if (!CurrentFragment || *CurrentFragment == Target)
305     return UseFrag;
306 
307   // Reject the target fragment if it doesn't fit wholly within the current
308   // fragment. TODO: We could instead chop up the target to fit in the case of
309   // a partial overlap.
310   if (Target.startInBits() < CurrentFragment->startInBits() ||
311       Target.endInBits() > CurrentFragment->endInBits())
312     return Skip;
313 
314   // Target fits within the current fragment, return it.
315   return UseFrag;
316 }
317 
318 static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
319   return DebugVariable(DVI->getVariable(), std::nullopt,
320                        DVI->getDebugLoc().getInlinedAt());
321 }
322 static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
323   return DebugVariable(DVR->getVariable(), std::nullopt,
324                        DVR->getDebugLoc().getInlinedAt());
325 }
326 
327 /// Helpers for handling new and old debug info modes in migrateDebugInfo.
328 /// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
329 /// on the \p Unused parameter type.
330 DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) {
331   (void)Unused;
332   return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P));
333 }
334 DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
335   (void)Unused;
336   return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P));
337 }
338 
339 /// Find linked dbg.assign and generate a new one with the correct
340 /// FragmentInfo. Link Inst to the new dbg.assign.  If Value is nullptr the
341 /// value component is copied from the old dbg.assign to the new.
342 /// \param OldAlloca             Alloca for the variable before splitting.
343 /// \param IsSplit               True if the store (not necessarily alloca)
344 ///                              is being split.
345 /// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
346 /// \param SliceSizeInBits       New number of bits being written to.
347 /// \param OldInst               Instruction that is being split.
348 /// \param Inst                  New instruction performing this part of the
349 ///                              split store.
350 /// \param Dest                  Store destination.
351 /// \param Value                 Stored value.
352 /// \param DL                    Datalayout.
353 static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
354                              uint64_t OldAllocaOffsetInBits,
355                              uint64_t SliceSizeInBits, Instruction *OldInst,
356                              Instruction *Inst, Value *Dest, Value *Value,
357                              const DataLayout &DL) {
358   auto MarkerRange = at::getAssignmentMarkers(OldInst);
359   auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
360   // Nothing to do if OldInst has no linked dbg.assign intrinsics.
361   if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
362     return;
363 
364   LLVM_DEBUG(dbgs() << "  migrateDebugInfo\n");
365   LLVM_DEBUG(dbgs() << "    OldAlloca: " << *OldAlloca << "\n");
366   LLVM_DEBUG(dbgs() << "    IsSplit: " << IsSplit << "\n");
367   LLVM_DEBUG(dbgs() << "    OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
368                     << "\n");
369   LLVM_DEBUG(dbgs() << "    SliceSizeInBits: " << SliceSizeInBits << "\n");
370   LLVM_DEBUG(dbgs() << "    OldInst: " << *OldInst << "\n");
371   LLVM_DEBUG(dbgs() << "    Inst: " << *Inst << "\n");
372   LLVM_DEBUG(dbgs() << "    Dest: " << *Dest << "\n");
373   if (Value)
374     LLVM_DEBUG(dbgs() << "    Value: " << *Value << "\n");
375 
376   /// Map of aggregate variables to their fragment associated with OldAlloca.
377   DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
378       BaseFragments;
379   for (auto *DAI : at::getAssignmentMarkers(OldAlloca))
380     BaseFragments[getAggregateVariable(DAI)] =
381         DAI->getExpression()->getFragmentInfo();
382   for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
383     BaseFragments[getAggregateVariable(DVR)] =
384         DVR->getExpression()->getFragmentInfo();
385 
386   // The new inst needs a DIAssignID unique metadata tag (if OldInst has
387   // one). It shouldn't already have one: assert this assumption.
388   assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
389   DIAssignID *NewID = nullptr;
390   auto &Ctx = Inst->getContext();
391   DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
392   assert(OldAlloca->isStaticAlloca());
393 
394   auto MigrateDbgAssign = [&](auto *DbgAssign) {
395     LLVM_DEBUG(dbgs() << "      existing dbg.assign is: " << *DbgAssign
396                       << "\n");
397     auto *Expr = DbgAssign->getExpression();
398     bool SetKillLocation = false;
399 
400     if (IsSplit) {
401       std::optional<DIExpression::FragmentInfo> BaseFragment;
402       {
403         auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
404         if (R == BaseFragments.end())
405           return;
406         BaseFragment = R->second;
407       }
408       std::optional<DIExpression::FragmentInfo> CurrentFragment =
409           Expr->getFragmentInfo();
410       DIExpression::FragmentInfo NewFragment;
411       FragCalcResult Result = calculateFragment(
412           DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
413           BaseFragment, CurrentFragment, NewFragment);
414 
415       if (Result == Skip)
416         return;
417       if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
418         if (CurrentFragment) {
419           // Rewrite NewFragment to be relative to the existing one (this is
420           // what createFragmentExpression wants).  CalculateFragment has
421           // already resolved the size for us. FIXME: Should it return the
422           // relative fragment too?
423           NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
424         }
425         // Add the new fragment info to the existing expression if possible.
426         if (auto E = DIExpression::createFragmentExpression(
427                 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
428           Expr = *E;
429         } else {
430           // Otherwise, add the new fragment info to an empty expression and
431           // discard the value component of this dbg.assign as the value cannot
432           // be computed with the new fragment.
433           Expr = *DIExpression::createFragmentExpression(
434               DIExpression::get(Expr->getContext(), {}),
435               NewFragment.OffsetInBits, NewFragment.SizeInBits);
436           SetKillLocation = true;
437         }
438       }
439     }
440 
441     // If we haven't created a DIAssignID ID do that now and attach it to Inst.
442     if (!NewID) {
443       NewID = DIAssignID::getDistinct(Ctx);
444       Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
445     }
446 
447     ::Value *NewValue = Value ? Value : DbgAssign->getValue();
448     auto *NewAssign = UnwrapDbgInstPtr(
449         DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
450                             Dest, DIExpression::get(Expr->getContext(), {}),
451                             DbgAssign->getDebugLoc()),
452         DbgAssign);
453 
454     // If we've updated the value but the original dbg.assign has an arglist
455     // then kill it now - we can't use the requested new value.
456     // We can't replace the DIArgList with the new value as it'd leave
457     // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
458     // an arglist). And we can't keep the DIArgList in case the linked store
459     // is being split - in which case the DIArgList + expression may no longer
460     // be computing the correct value.
461     // This should be a very rare situation as it requires the value being
462     // stored to differ from the dbg.assign (i.e., the value has been
463     // represented differently in the debug intrinsic for some reason).
464     SetKillLocation |=
465         Value && (DbgAssign->hasArgList() ||
466                   !DbgAssign->getExpression()->isSingleLocationExpression());
467     if (SetKillLocation)
468       NewAssign->setKillLocation();
469 
470     // We could use more precision here at the cost of some additional (code)
471     // complexity - if the original dbg.assign was adjacent to its store, we
472     // could position this new dbg.assign adjacent to its store rather than the
473     // old dbg.assgn. That would result in interleaved dbg.assigns rather than
474     // what we get now:
475     //    split store !1
476     //    split store !2
477     //    dbg.assign !1
478     //    dbg.assign !2
479     // This (current behaviour) results results in debug assignments being
480     // noted as slightly offset (in code) from the store. In practice this
481     // should have little effect on the debugging experience due to the fact
482     // that all the split stores should get the same line number.
483     NewAssign->moveBefore(DbgAssign->getIterator());
484 
485     NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
486     LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
487   };
488 
489   for_each(MarkerRange, MigrateDbgAssign);
490   for_each(DVRAssignMarkerRange, MigrateDbgAssign);
491 }
492 
493 namespace {
494 
495 /// A custom IRBuilder inserter which prefixes all names, but only in
496 /// Assert builds.
497 class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
498   std::string Prefix;
499 
500   Twine getNameWithPrefix(const Twine &Name) const {
501     return Name.isTriviallyEmpty() ? Name : Prefix + Name;
502   }
503 
504 public:
505   void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
506 
507   void InsertHelper(Instruction *I, const Twine &Name,
508                     BasicBlock::iterator InsertPt) const override {
509     IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
510                                            InsertPt);
511   }
512 };
513 
514 /// Provide a type for IRBuilder that drops names in release builds.
515 using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
516 
517 /// A used slice of an alloca.
518 ///
519 /// This structure represents a slice of an alloca used by some instruction. It
520 /// stores both the begin and end offsets of this use, a pointer to the use
521 /// itself, and a flag indicating whether we can classify the use as splittable
522 /// or not when forming partitions of the alloca.
523 class Slice {
524   /// The beginning offset of the range.
525   uint64_t BeginOffset = 0;
526 
527   /// The ending offset, not included in the range.
528   uint64_t EndOffset = 0;
529 
530   /// Storage for both the use of this slice and whether it can be
531   /// split.
532   PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
533 
534 public:
535   Slice() = default;
536 
537   Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
538       : BeginOffset(BeginOffset), EndOffset(EndOffset),
539         UseAndIsSplittable(U, IsSplittable) {}
540 
541   uint64_t beginOffset() const { return BeginOffset; }
542   uint64_t endOffset() const { return EndOffset; }
543 
544   bool isSplittable() const { return UseAndIsSplittable.getInt(); }
545   void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
546 
547   Use *getUse() const { return UseAndIsSplittable.getPointer(); }
548 
549   bool isDead() const { return getUse() == nullptr; }
550   void kill() { UseAndIsSplittable.setPointer(nullptr); }
551 
552   /// Support for ordering ranges.
553   ///
554   /// This provides an ordering over ranges such that start offsets are
555   /// always increasing, and within equal start offsets, the end offsets are
556   /// decreasing. Thus the spanning range comes first in a cluster with the
557   /// same start position.
558   bool operator<(const Slice &RHS) const {
559     if (beginOffset() < RHS.beginOffset())
560       return true;
561     if (beginOffset() > RHS.beginOffset())
562       return false;
563     if (isSplittable() != RHS.isSplittable())
564       return !isSplittable();
565     if (endOffset() > RHS.endOffset())
566       return true;
567     return false;
568   }
569 
570   /// Support comparison with a single offset to allow binary searches.
571   friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
572                                               uint64_t RHSOffset) {
573     return LHS.beginOffset() < RHSOffset;
574   }
575   friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
576                                               const Slice &RHS) {
577     return LHSOffset < RHS.beginOffset();
578   }
579 
580   bool operator==(const Slice &RHS) const {
581     return isSplittable() == RHS.isSplittable() &&
582            beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
583   }
584   bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
585 };
586 
587 /// Representation of the alloca slices.
588 ///
589 /// This class represents the slices of an alloca which are formed by its
590 /// various uses. If a pointer escapes, we can't fully build a representation
591 /// for the slices used and we reflect that in this structure. The uses are
592 /// stored, sorted by increasing beginning offset and with unsplittable slices
593 /// starting at a particular offset before splittable slices.
594 class AllocaSlices {
595 public:
596   /// Construct the slices of a particular alloca.
597   AllocaSlices(const DataLayout &DL, AllocaInst &AI);
598 
599   /// Test whether a pointer to the allocation escapes our analysis.
600   ///
601   /// If this is true, the slices are never fully built and should be
602   /// ignored.
603   bool isEscaped() const { return PointerEscapingInstr; }
604   bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
605 
606   /// Support for iterating over the slices.
607   /// @{
608   using iterator = SmallVectorImpl<Slice>::iterator;
609   using range = iterator_range<iterator>;
610 
611   iterator begin() { return Slices.begin(); }
612   iterator end() { return Slices.end(); }
613 
614   using const_iterator = SmallVectorImpl<Slice>::const_iterator;
615   using const_range = iterator_range<const_iterator>;
616 
617   const_iterator begin() const { return Slices.begin(); }
618   const_iterator end() const { return Slices.end(); }
619   /// @}
620 
621   /// Erase a range of slices.
622   void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
623 
624   /// Insert new slices for this alloca.
625   ///
626   /// This moves the slices into the alloca's slices collection, and re-sorts
627   /// everything so that the usual ordering properties of the alloca's slices
628   /// hold.
629   void insert(ArrayRef<Slice> NewSlices) {
630     int OldSize = Slices.size();
631     Slices.append(NewSlices.begin(), NewSlices.end());
632     auto SliceI = Slices.begin() + OldSize;
633     std::stable_sort(SliceI, Slices.end());
634     std::inplace_merge(Slices.begin(), SliceI, Slices.end());
635   }
636 
637   // Forward declare the iterator and range accessor for walking the
638   // partitions.
639   class partition_iterator;
640   iterator_range<partition_iterator> partitions();
641 
642   /// Access the dead users for this alloca.
643   ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
644 
645   /// Access Uses that should be dropped if the alloca is promotable.
646   ArrayRef<Use *> getDeadUsesIfPromotable() const {
647     return DeadUseIfPromotable;
648   }
649 
650   /// Access the dead operands referring to this alloca.
651   ///
652   /// These are operands which have cannot actually be used to refer to the
653   /// alloca as they are outside its range and the user doesn't correct for
654   /// that. These mostly consist of PHI node inputs and the like which we just
655   /// need to replace with undef.
656   ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
657 
658 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
659   void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
660   void printSlice(raw_ostream &OS, const_iterator I,
661                   StringRef Indent = "  ") const;
662   void printUse(raw_ostream &OS, const_iterator I,
663                 StringRef Indent = "  ") const;
664   void print(raw_ostream &OS) const;
665   void dump(const_iterator I) const;
666   void dump() const;
667 #endif
668 
669 private:
670   template <typename DerivedT, typename RetT = void> class BuilderBase;
671   class SliceBuilder;
672 
673   friend class AllocaSlices::SliceBuilder;
674 
675 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
676   /// Handle to alloca instruction to simplify method interfaces.
677   AllocaInst &AI;
678 #endif
679 
680   /// The instruction responsible for this alloca not having a known set
681   /// of slices.
682   ///
683   /// When an instruction (potentially) escapes the pointer to the alloca, we
684   /// store a pointer to that here and abort trying to form slices of the
685   /// alloca. This will be null if the alloca slices are analyzed successfully.
686   Instruction *PointerEscapingInstr;
687   Instruction *PointerEscapingInstrReadOnly;
688 
689   /// The slices of the alloca.
690   ///
691   /// We store a vector of the slices formed by uses of the alloca here. This
692   /// vector is sorted by increasing begin offset, and then the unsplittable
693   /// slices before the splittable ones. See the Slice inner class for more
694   /// details.
695   SmallVector<Slice, 8> Slices;
696 
697   /// Instructions which will become dead if we rewrite the alloca.
698   ///
699   /// Note that these are not separated by slice. This is because we expect an
700   /// alloca to be completely rewritten or not rewritten at all. If rewritten,
701   /// all these instructions can simply be removed and replaced with poison as
702   /// they come from outside of the allocated space.
703   SmallVector<Instruction *, 8> DeadUsers;
704 
705   /// Uses which will become dead if can promote the alloca.
706   SmallVector<Use *, 8> DeadUseIfPromotable;
707 
708   /// Operands which will become dead if we rewrite the alloca.
709   ///
710   /// These are operands that in their particular use can be replaced with
711   /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
712   /// to PHI nodes and the like. They aren't entirely dead (there might be
713   /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
714   /// want to swap this particular input for poison to simplify the use lists of
715   /// the alloca.
716   SmallVector<Use *, 8> DeadOperands;
717 };
718 
719 /// A partition of the slices.
720 ///
721 /// An ephemeral representation for a range of slices which can be viewed as
722 /// a partition of the alloca. This range represents a span of the alloca's
723 /// memory which cannot be split, and provides access to all of the slices
724 /// overlapping some part of the partition.
725 ///
726 /// Objects of this type are produced by traversing the alloca's slices, but
727 /// are only ephemeral and not persistent.
728 class Partition {
729 private:
730   friend class AllocaSlices;
731   friend class AllocaSlices::partition_iterator;
732 
733   using iterator = AllocaSlices::iterator;
734 
735   /// The beginning and ending offsets of the alloca for this
736   /// partition.
737   uint64_t BeginOffset = 0, EndOffset = 0;
738 
739   /// The start and end iterators of this partition.
740   iterator SI, SJ;
741 
742   /// A collection of split slice tails overlapping the partition.
743   SmallVector<Slice *, 4> SplitTails;
744 
745   /// Raw constructor builds an empty partition starting and ending at
746   /// the given iterator.
747   Partition(iterator SI) : SI(SI), SJ(SI) {}
748 
749 public:
750   /// The start offset of this partition.
751   ///
752   /// All of the contained slices start at or after this offset.
753   uint64_t beginOffset() const { return BeginOffset; }
754 
755   /// The end offset of this partition.
756   ///
757   /// All of the contained slices end at or before this offset.
758   uint64_t endOffset() const { return EndOffset; }
759 
760   /// The size of the partition.
761   ///
762   /// Note that this can never be zero.
763   uint64_t size() const {
764     assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
765     return EndOffset - BeginOffset;
766   }
767 
768   /// Test whether this partition contains no slices, and merely spans
769   /// a region occupied by split slices.
770   bool empty() const { return SI == SJ; }
771 
772   /// \name Iterate slices that start within the partition.
773   /// These may be splittable or unsplittable. They have a begin offset >= the
774   /// partition begin offset.
775   /// @{
776   // FIXME: We should probably define a "concat_iterator" helper and use that
777   // to stitch together pointee_iterators over the split tails and the
778   // contiguous iterators of the partition. That would give a much nicer
779   // interface here. We could then additionally expose filtered iterators for
780   // split, unsplit, and unsplittable splices based on the usage patterns.
781   iterator begin() const { return SI; }
782   iterator end() const { return SJ; }
783   /// @}
784 
785   /// Get the sequence of split slice tails.
786   ///
787   /// These tails are of slices which start before this partition but are
788   /// split and overlap into the partition. We accumulate these while forming
789   /// partitions.
790   ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
791 };
792 
793 } // end anonymous namespace
794 
795 /// An iterator over partitions of the alloca's slices.
796 ///
797 /// This iterator implements the core algorithm for partitioning the alloca's
798 /// slices. It is a forward iterator as we don't support backtracking for
799 /// efficiency reasons, and re-use a single storage area to maintain the
800 /// current set of split slices.
801 ///
802 /// It is templated on the slice iterator type to use so that it can operate
803 /// with either const or non-const slice iterators.
804 class AllocaSlices::partition_iterator
805     : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
806                                   Partition> {
807   friend class AllocaSlices;
808 
809   /// Most of the state for walking the partitions is held in a class
810   /// with a nice interface for examining them.
811   Partition P;
812 
813   /// We need to keep the end of the slices to know when to stop.
814   AllocaSlices::iterator SE;
815 
816   /// We also need to keep track of the maximum split end offset seen.
817   /// FIXME: Do we really?
818   uint64_t MaxSplitSliceEndOffset = 0;
819 
820   /// Sets the partition to be empty at given iterator, and sets the
821   /// end iterator.
822   partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
823       : P(SI), SE(SE) {
824     // If not already at the end, advance our state to form the initial
825     // partition.
826     if (SI != SE)
827       advance();
828   }
829 
830   /// Advance the iterator to the next partition.
831   ///
832   /// Requires that the iterator not be at the end of the slices.
833   void advance() {
834     assert((P.SI != SE || !P.SplitTails.empty()) &&
835            "Cannot advance past the end of the slices!");
836 
837     // Clear out any split uses which have ended.
838     if (!P.SplitTails.empty()) {
839       if (P.EndOffset >= MaxSplitSliceEndOffset) {
840         // If we've finished all splits, this is easy.
841         P.SplitTails.clear();
842         MaxSplitSliceEndOffset = 0;
843       } else {
844         // Remove the uses which have ended in the prior partition. This
845         // cannot change the max split slice end because we just checked that
846         // the prior partition ended prior to that max.
847         llvm::erase_if(P.SplitTails,
848                        [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
849         assert(llvm::any_of(P.SplitTails,
850                             [&](Slice *S) {
851                               return S->endOffset() == MaxSplitSliceEndOffset;
852                             }) &&
853                "Could not find the current max split slice offset!");
854         assert(llvm::all_of(P.SplitTails,
855                             [&](Slice *S) {
856                               return S->endOffset() <= MaxSplitSliceEndOffset;
857                             }) &&
858                "Max split slice end offset is not actually the max!");
859       }
860     }
861 
862     // If P.SI is already at the end, then we've cleared the split tail and
863     // now have an end iterator.
864     if (P.SI == SE) {
865       assert(P.SplitTails.empty() && "Failed to clear the split slices!");
866       return;
867     }
868 
869     // If we had a non-empty partition previously, set up the state for
870     // subsequent partitions.
871     if (P.SI != P.SJ) {
872       // Accumulate all the splittable slices which started in the old
873       // partition into the split list.
874       for (Slice &S : P)
875         if (S.isSplittable() && S.endOffset() > P.EndOffset) {
876           P.SplitTails.push_back(&S);
877           MaxSplitSliceEndOffset =
878               std::max(S.endOffset(), MaxSplitSliceEndOffset);
879         }
880 
881       // Start from the end of the previous partition.
882       P.SI = P.SJ;
883 
884       // If P.SI is now at the end, we at most have a tail of split slices.
885       if (P.SI == SE) {
886         P.BeginOffset = P.EndOffset;
887         P.EndOffset = MaxSplitSliceEndOffset;
888         return;
889       }
890 
891       // If the we have split slices and the next slice is after a gap and is
892       // not splittable immediately form an empty partition for the split
893       // slices up until the next slice begins.
894       if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
895           !P.SI->isSplittable()) {
896         P.BeginOffset = P.EndOffset;
897         P.EndOffset = P.SI->beginOffset();
898         return;
899       }
900     }
901 
902     // OK, we need to consume new slices. Set the end offset based on the
903     // current slice, and step SJ past it. The beginning offset of the
904     // partition is the beginning offset of the next slice unless we have
905     // pre-existing split slices that are continuing, in which case we begin
906     // at the prior end offset.
907     P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
908     P.EndOffset = P.SI->endOffset();
909     ++P.SJ;
910 
911     // There are two strategies to form a partition based on whether the
912     // partition starts with an unsplittable slice or a splittable slice.
913     if (!P.SI->isSplittable()) {
914       // When we're forming an unsplittable region, it must always start at
915       // the first slice and will extend through its end.
916       assert(P.BeginOffset == P.SI->beginOffset());
917 
918       // Form a partition including all of the overlapping slices with this
919       // unsplittable slice.
920       while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
921         if (!P.SJ->isSplittable())
922           P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
923         ++P.SJ;
924       }
925 
926       // We have a partition across a set of overlapping unsplittable
927       // partitions.
928       return;
929     }
930 
931     // If we're starting with a splittable slice, then we need to form
932     // a synthetic partition spanning it and any other overlapping splittable
933     // splices.
934     assert(P.SI->isSplittable() && "Forming a splittable partition!");
935 
936     // Collect all of the overlapping splittable slices.
937     while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
938            P.SJ->isSplittable()) {
939       P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
940       ++P.SJ;
941     }
942 
943     // Back upiP.EndOffset if we ended the span early when encountering an
944     // unsplittable slice. This synthesizes the early end offset of
945     // a partition spanning only splittable slices.
946     if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
947       assert(!P.SJ->isSplittable());
948       P.EndOffset = P.SJ->beginOffset();
949     }
950   }
951 
952 public:
953   bool operator==(const partition_iterator &RHS) const {
954     assert(SE == RHS.SE &&
955            "End iterators don't match between compared partition iterators!");
956 
957     // The observed positions of partitions is marked by the P.SI iterator and
958     // the emptiness of the split slices. The latter is only relevant when
959     // P.SI == SE, as the end iterator will additionally have an empty split
960     // slices list, but the prior may have the same P.SI and a tail of split
961     // slices.
962     if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
963       assert(P.SJ == RHS.P.SJ &&
964              "Same set of slices formed two different sized partitions!");
965       assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
966              "Same slice position with differently sized non-empty split "
967              "slice tails!");
968       return true;
969     }
970     return false;
971   }
972 
973   partition_iterator &operator++() {
974     advance();
975     return *this;
976   }
977 
978   Partition &operator*() { return P; }
979 };
980 
981 /// A forward range over the partitions of the alloca's slices.
982 ///
983 /// This accesses an iterator range over the partitions of the alloca's
984 /// slices. It computes these partitions on the fly based on the overlapping
985 /// offsets of the slices and the ability to split them. It will visit "empty"
986 /// partitions to cover regions of the alloca only accessed via split
987 /// slices.
988 iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
989   return make_range(partition_iterator(begin(), end()),
990                     partition_iterator(end(), end()));
991 }
992 
993 static Value *foldSelectInst(SelectInst &SI) {
994   // If the condition being selected on is a constant or the same value is
995   // being selected between, fold the select. Yes this does (rarely) happen
996   // early on.
997   if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
998     return SI.getOperand(1 + CI->isZero());
999   if (SI.getOperand(1) == SI.getOperand(2))
1000     return SI.getOperand(1);
1001 
1002   return nullptr;
1003 }
1004 
1005 /// A helper that folds a PHI node or a select.
1006 static Value *foldPHINodeOrSelectInst(Instruction &I) {
1007   if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1008     // If PN merges together the same value, return that value.
1009     return PN->hasConstantValue();
1010   }
1011   return foldSelectInst(cast<SelectInst>(I));
1012 }
1013 
1014 /// Builder for the alloca slices.
1015 ///
1016 /// This class builds a set of alloca slices by recursively visiting the uses
1017 /// of an alloca and making a slice for each load and store at each offset.
1018 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1019   friend class PtrUseVisitor<SliceBuilder>;
1020   friend class InstVisitor<SliceBuilder>;
1021 
1022   using Base = PtrUseVisitor<SliceBuilder>;
1023 
1024   const uint64_t AllocSize;
1025   AllocaSlices &AS;
1026 
1027   SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1028   SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1029 
1030   /// Set to de-duplicate dead instructions found in the use walk.
1031   SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1032 
1033 public:
1034   SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1035       : PtrUseVisitor<SliceBuilder>(DL),
1036         AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
1037         AS(AS) {}
1038 
1039 private:
1040   void markAsDead(Instruction &I) {
1041     if (VisitedDeadInsts.insert(&I).second)
1042       AS.DeadUsers.push_back(&I);
1043   }
1044 
1045   void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1046                  bool IsSplittable = false) {
1047     // Completely skip uses which have a zero size or start either before or
1048     // past the end of the allocation.
1049     if (Size == 0 || Offset.uge(AllocSize)) {
1050       LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1051                         << Offset
1052                         << " which has zero size or starts outside of the "
1053                         << AllocSize << " byte alloca:\n"
1054                         << "    alloca: " << AS.AI << "\n"
1055                         << "       use: " << I << "\n");
1056       return markAsDead(I);
1057     }
1058 
1059     uint64_t BeginOffset = Offset.getZExtValue();
1060     uint64_t EndOffset = BeginOffset + Size;
1061 
1062     // Clamp the end offset to the end of the allocation. Note that this is
1063     // formulated to handle even the case where "BeginOffset + Size" overflows.
1064     // This may appear superficially to be something we could ignore entirely,
1065     // but that is not so! There may be widened loads or PHI-node uses where
1066     // some instructions are dead but not others. We can't completely ignore
1067     // them, and so have to record at least the information here.
1068     assert(AllocSize >= BeginOffset); // Established above.
1069     if (Size > AllocSize - BeginOffset) {
1070       LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1071                         << Offset << " to remain within the " << AllocSize
1072                         << " byte alloca:\n"
1073                         << "    alloca: " << AS.AI << "\n"
1074                         << "       use: " << I << "\n");
1075       EndOffset = AllocSize;
1076     }
1077 
1078     AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
1079   }
1080 
1081   void visitBitCastInst(BitCastInst &BC) {
1082     if (BC.use_empty())
1083       return markAsDead(BC);
1084 
1085     return Base::visitBitCastInst(BC);
1086   }
1087 
1088   void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1089     if (ASC.use_empty())
1090       return markAsDead(ASC);
1091 
1092     return Base::visitAddrSpaceCastInst(ASC);
1093   }
1094 
1095   void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1096     if (GEPI.use_empty())
1097       return markAsDead(GEPI);
1098 
1099     return Base::visitGetElementPtrInst(GEPI);
1100   }
1101 
1102   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1103                          uint64_t Size, bool IsVolatile) {
1104     // We allow splitting of non-volatile loads and stores where the type is an
1105     // integer type. These may be used to implement 'memcpy' or other "transfer
1106     // of bits" patterns.
1107     bool IsSplittable =
1108         Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1109 
1110     insertUse(I, Offset, Size, IsSplittable);
1111   }
1112 
1113   void visitLoadInst(LoadInst &LI) {
1114     assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1115            "All simple FCA loads should have been pre-split");
1116 
1117     if (!IsOffsetKnown)
1118       return PI.setAborted(&LI);
1119 
1120     TypeSize Size = DL.getTypeStoreSize(LI.getType());
1121     if (Size.isScalable())
1122       return PI.setAborted(&LI);
1123 
1124     return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1125                              LI.isVolatile());
1126   }
1127 
1128   void visitStoreInst(StoreInst &SI) {
1129     Value *ValOp = SI.getValueOperand();
1130     if (ValOp == *U)
1131       return PI.setEscapedAndAborted(&SI);
1132     if (!IsOffsetKnown)
1133       return PI.setAborted(&SI);
1134 
1135     TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1136     if (StoreSize.isScalable())
1137       return PI.setAborted(&SI);
1138 
1139     uint64_t Size = StoreSize.getFixedValue();
1140 
1141     // If this memory access can be shown to *statically* extend outside the
1142     // bounds of the allocation, it's behavior is undefined, so simply
1143     // ignore it. Note that this is more strict than the generic clamping
1144     // behavior of insertUse. We also try to handle cases which might run the
1145     // risk of overflow.
1146     // FIXME: We should instead consider the pointer to have escaped if this
1147     // function is being instrumented for addressing bugs or race conditions.
1148     if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1149       LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1150                         << Offset << " which extends past the end of the "
1151                         << AllocSize << " byte alloca:\n"
1152                         << "    alloca: " << AS.AI << "\n"
1153                         << "       use: " << SI << "\n");
1154       return markAsDead(SI);
1155     }
1156 
1157     assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1158            "All simple FCA stores should have been pre-split");
1159     handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1160   }
1161 
1162   void visitMemSetInst(MemSetInst &II) {
1163     assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1164     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1165     if ((Length && Length->getValue() == 0) ||
1166         (IsOffsetKnown && Offset.uge(AllocSize)))
1167       // Zero-length mem transfer intrinsics can be ignored entirely.
1168       return markAsDead(II);
1169 
1170     if (!IsOffsetKnown)
1171       return PI.setAborted(&II);
1172 
1173     insertUse(II, Offset,
1174               Length ? Length->getLimitedValue()
1175                      : AllocSize - Offset.getLimitedValue(),
1176               (bool)Length);
1177   }
1178 
1179   void visitMemTransferInst(MemTransferInst &II) {
1180     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1181     if (Length && Length->getValue() == 0)
1182       // Zero-length mem transfer intrinsics can be ignored entirely.
1183       return markAsDead(II);
1184 
1185     // Because we can visit these intrinsics twice, also check to see if the
1186     // first time marked this instruction as dead. If so, skip it.
1187     if (VisitedDeadInsts.count(&II))
1188       return;
1189 
1190     if (!IsOffsetKnown)
1191       return PI.setAborted(&II);
1192 
1193     // This side of the transfer is completely out-of-bounds, and so we can
1194     // nuke the entire transfer. However, we also need to nuke the other side
1195     // if already added to our partitions.
1196     // FIXME: Yet another place we really should bypass this when
1197     // instrumenting for ASan.
1198     if (Offset.uge(AllocSize)) {
1199       SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1200           MemTransferSliceMap.find(&II);
1201       if (MTPI != MemTransferSliceMap.end())
1202         AS.Slices[MTPI->second].kill();
1203       return markAsDead(II);
1204     }
1205 
1206     uint64_t RawOffset = Offset.getLimitedValue();
1207     uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1208 
1209     // Check for the special case where the same exact value is used for both
1210     // source and dest.
1211     if (*U == II.getRawDest() && *U == II.getRawSource()) {
1212       // For non-volatile transfers this is a no-op.
1213       if (!II.isVolatile())
1214         return markAsDead(II);
1215 
1216       return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1217     }
1218 
1219     // If we have seen both source and destination for a mem transfer, then
1220     // they both point to the same alloca.
1221     bool Inserted;
1222     SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1223     std::tie(MTPI, Inserted) =
1224         MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1225     unsigned PrevIdx = MTPI->second;
1226     if (!Inserted) {
1227       Slice &PrevP = AS.Slices[PrevIdx];
1228 
1229       // Check if the begin offsets match and this is a non-volatile transfer.
1230       // In that case, we can completely elide the transfer.
1231       if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1232         PrevP.kill();
1233         return markAsDead(II);
1234       }
1235 
1236       // Otherwise we have an offset transfer within the same alloca. We can't
1237       // split those.
1238       PrevP.makeUnsplittable();
1239     }
1240 
1241     // Insert the use now that we've fixed up the splittable nature.
1242     insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1243 
1244     // Check that we ended up with a valid index in the map.
1245     assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1246            "Map index doesn't point back to a slice with this user.");
1247   }
1248 
1249   // Disable SRoA for any intrinsics except for lifetime invariants and
1250   // invariant group.
1251   // FIXME: What about debug intrinsics? This matches old behavior, but
1252   // doesn't make sense.
1253   void visitIntrinsicInst(IntrinsicInst &II) {
1254     if (II.isDroppable()) {
1255       AS.DeadUseIfPromotable.push_back(U);
1256       return;
1257     }
1258 
1259     if (!IsOffsetKnown)
1260       return PI.setAborted(&II);
1261 
1262     if (II.isLifetimeStartOrEnd()) {
1263       ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
1264       uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
1265                                Length->getLimitedValue());
1266       insertUse(II, Offset, Size, true);
1267       return;
1268     }
1269 
1270     if (II.isLaunderOrStripInvariantGroup()) {
1271       insertUse(II, Offset, AllocSize, true);
1272       enqueueUsers(II);
1273       return;
1274     }
1275 
1276     Base::visitIntrinsicInst(II);
1277   }
1278 
1279   Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1280     // We consider any PHI or select that results in a direct load or store of
1281     // the same offset to be a viable use for slicing purposes. These uses
1282     // are considered unsplittable and the size is the maximum loaded or stored
1283     // size.
1284     SmallPtrSet<Instruction *, 4> Visited;
1285     SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1286     Visited.insert(Root);
1287     Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1288     const DataLayout &DL = Root->getDataLayout();
1289     // If there are no loads or stores, the access is dead. We mark that as
1290     // a size zero access.
1291     Size = 0;
1292     do {
1293       Instruction *I, *UsedI;
1294       std::tie(UsedI, I) = Uses.pop_back_val();
1295 
1296       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1297         TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1298         if (LoadSize.isScalable()) {
1299           PI.setAborted(LI);
1300           return nullptr;
1301         }
1302         Size = std::max(Size, LoadSize.getFixedValue());
1303         continue;
1304       }
1305       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1306         Value *Op = SI->getOperand(0);
1307         if (Op == UsedI)
1308           return SI;
1309         TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1310         if (StoreSize.isScalable()) {
1311           PI.setAborted(SI);
1312           return nullptr;
1313         }
1314         Size = std::max(Size, StoreSize.getFixedValue());
1315         continue;
1316       }
1317 
1318       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1319         if (!GEP->hasAllZeroIndices())
1320           return GEP;
1321       } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1322                  !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1323         return I;
1324       }
1325 
1326       for (User *U : I->users())
1327         if (Visited.insert(cast<Instruction>(U)).second)
1328           Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1329     } while (!Uses.empty());
1330 
1331     return nullptr;
1332   }
1333 
1334   void visitPHINodeOrSelectInst(Instruction &I) {
1335     assert(isa<PHINode>(I) || isa<SelectInst>(I));
1336     if (I.use_empty())
1337       return markAsDead(I);
1338 
1339     // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1340     // instructions in this BB, which may be required during rewriting. Bail out
1341     // on these cases.
1342     if (isa<PHINode>(I) &&
1343         I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1344       return PI.setAborted(&I);
1345 
1346     // TODO: We could use simplifyInstruction here to fold PHINodes and
1347     // SelectInsts. However, doing so requires to change the current
1348     // dead-operand-tracking mechanism. For instance, suppose neither loading
1349     // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1350     // trap either.  However, if we simply replace %U with undef using the
1351     // current dead-operand-tracking mechanism, "load (select undef, undef,
1352     // %other)" may trap because the select may return the first operand
1353     // "undef".
1354     if (Value *Result = foldPHINodeOrSelectInst(I)) {
1355       if (Result == *U)
1356         // If the result of the constant fold will be the pointer, recurse
1357         // through the PHI/select as if we had RAUW'ed it.
1358         enqueueUsers(I);
1359       else
1360         // Otherwise the operand to the PHI/select is dead, and we can replace
1361         // it with poison.
1362         AS.DeadOperands.push_back(U);
1363 
1364       return;
1365     }
1366 
1367     if (!IsOffsetKnown)
1368       return PI.setAborted(&I);
1369 
1370     // See if we already have computed info on this node.
1371     uint64_t &Size = PHIOrSelectSizes[&I];
1372     if (!Size) {
1373       // This is a new PHI/Select, check for an unsafe use of it.
1374       if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1375         return PI.setAborted(UnsafeI);
1376     }
1377 
1378     // For PHI and select operands outside the alloca, we can't nuke the entire
1379     // phi or select -- the other side might still be relevant, so we special
1380     // case them here and use a separate structure to track the operands
1381     // themselves which should be replaced with poison.
1382     // FIXME: This should instead be escaped in the event we're instrumenting
1383     // for address sanitization.
1384     if (Offset.uge(AllocSize)) {
1385       AS.DeadOperands.push_back(U);
1386       return;
1387     }
1388 
1389     insertUse(I, Offset, Size);
1390   }
1391 
1392   void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1393 
1394   void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1395 
1396   /// Disable SROA entirely if there are unhandled users of the alloca.
1397   void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1398 
1399   void visitCallBase(CallBase &CB) {
1400     // If the call operand is NoCapture ReadOnly, then we mark it as
1401     // EscapedReadOnly.
1402     if (CB.isDataOperand(U) &&
1403         CB.doesNotCapture(U->getOperandNo()) &&
1404         CB.onlyReadsMemory(U->getOperandNo())) {
1405       PI.setEscapedReadOnly(&CB);
1406       return;
1407     }
1408 
1409     Base::visitCallBase(CB);
1410   }
1411 };
1412 
1413 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1414     :
1415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1416       AI(AI),
1417 #endif
1418       PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1419   SliceBuilder PB(DL, AI, *this);
1420   SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1421   if (PtrI.isEscaped() || PtrI.isAborted()) {
1422     // FIXME: We should sink the escape vs. abort info into the caller nicely,
1423     // possibly by just storing the PtrInfo in the AllocaSlices.
1424     PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1425                                                   : PtrI.getAbortingInst();
1426     assert(PointerEscapingInstr && "Did not track a bad instruction");
1427     return;
1428   }
1429   PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1430 
1431   llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1432 
1433   // Sort the uses. This arranges for the offsets to be in ascending order,
1434   // and the sizes to be in descending order.
1435   llvm::stable_sort(Slices);
1436 }
1437 
1438 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1439 
1440 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1441                          StringRef Indent) const {
1442   printSlice(OS, I, Indent);
1443   OS << "\n";
1444   printUse(OS, I, Indent);
1445 }
1446 
1447 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1448                               StringRef Indent) const {
1449   OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1450      << " slice #" << (I - begin())
1451      << (I->isSplittable() ? " (splittable)" : "");
1452 }
1453 
1454 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1455                             StringRef Indent) const {
1456   OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
1457 }
1458 
1459 void AllocaSlices::print(raw_ostream &OS) const {
1460   if (PointerEscapingInstr) {
1461     OS << "Can't analyze slices for alloca: " << AI << "\n"
1462        << "  A pointer to this alloca escaped by:\n"
1463        << "  " << *PointerEscapingInstr << "\n";
1464     return;
1465   }
1466 
1467   if (PointerEscapingInstrReadOnly)
1468     OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1469 
1470   OS << "Slices of alloca: " << AI << "\n";
1471   for (const_iterator I = begin(), E = end(); I != E; ++I)
1472     print(OS, I);
1473 }
1474 
1475 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1476   print(dbgs(), I);
1477 }
1478 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1479 
1480 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1481 
1482 /// Walk the range of a partitioning looking for a common type to cover this
1483 /// sequence of slices.
1484 static std::pair<Type *, IntegerType *>
1485 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1486                uint64_t EndOffset) {
1487   Type *Ty = nullptr;
1488   bool TyIsCommon = true;
1489   IntegerType *ITy = nullptr;
1490 
1491   // Note that we need to look at *every* alloca slice's Use to ensure we
1492   // always get consistent results regardless of the order of slices.
1493   for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1494     Use *U = I->getUse();
1495     if (isa<IntrinsicInst>(*U->getUser()))
1496       continue;
1497     if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1498       continue;
1499 
1500     Type *UserTy = nullptr;
1501     if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1502       UserTy = LI->getType();
1503     } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1504       UserTy = SI->getValueOperand()->getType();
1505     }
1506 
1507     if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1508       // If the type is larger than the partition, skip it. We only encounter
1509       // this for split integer operations where we want to use the type of the
1510       // entity causing the split. Also skip if the type is not a byte width
1511       // multiple.
1512       if (UserITy->getBitWidth() % 8 != 0 ||
1513           UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1514         continue;
1515 
1516       // Track the largest bitwidth integer type used in this way in case there
1517       // is no common type.
1518       if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1519         ITy = UserITy;
1520     }
1521 
1522     // To avoid depending on the order of slices, Ty and TyIsCommon must not
1523     // depend on types skipped above.
1524     if (!UserTy || (Ty && Ty != UserTy))
1525       TyIsCommon = false; // Give up on anything but an iN type.
1526     else
1527       Ty = UserTy;
1528   }
1529 
1530   return {TyIsCommon ? Ty : nullptr, ITy};
1531 }
1532 
1533 /// PHI instructions that use an alloca and are subsequently loaded can be
1534 /// rewritten to load both input pointers in the pred blocks and then PHI the
1535 /// results, allowing the load of the alloca to be promoted.
1536 /// From this:
1537 ///   %P2 = phi [i32* %Alloca, i32* %Other]
1538 ///   %V = load i32* %P2
1539 /// to:
1540 ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
1541 ///   ...
1542 ///   %V2 = load i32* %Other
1543 ///   ...
1544 ///   %V = phi [i32 %V1, i32 %V2]
1545 ///
1546 /// We can do this to a select if its only uses are loads and if the operands
1547 /// to the select can be loaded unconditionally.
1548 ///
1549 /// FIXME: This should be hoisted into a generic utility, likely in
1550 /// Transforms/Util/Local.h
1551 static bool isSafePHIToSpeculate(PHINode &PN) {
1552   const DataLayout &DL = PN.getDataLayout();
1553 
1554   // For now, we can only do this promotion if the load is in the same block
1555   // as the PHI, and if there are no stores between the phi and load.
1556   // TODO: Allow recursive phi users.
1557   // TODO: Allow stores.
1558   BasicBlock *BB = PN.getParent();
1559   Align MaxAlign;
1560   uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1561   Type *LoadType = nullptr;
1562   for (User *U : PN.users()) {
1563     LoadInst *LI = dyn_cast<LoadInst>(U);
1564     if (!LI || !LI->isSimple())
1565       return false;
1566 
1567     // For now we only allow loads in the same block as the PHI.  This is
1568     // a common case that happens when instcombine merges two loads through
1569     // a PHI.
1570     if (LI->getParent() != BB)
1571       return false;
1572 
1573     if (LoadType) {
1574       if (LoadType != LI->getType())
1575         return false;
1576     } else {
1577       LoadType = LI->getType();
1578     }
1579 
1580     // Ensure that there are no instructions between the PHI and the load that
1581     // could store.
1582     for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1583       if (BBI->mayWriteToMemory())
1584         return false;
1585 
1586     MaxAlign = std::max(MaxAlign, LI->getAlign());
1587   }
1588 
1589   if (!LoadType)
1590     return false;
1591 
1592   APInt LoadSize =
1593       APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1594 
1595   // We can only transform this if it is safe to push the loads into the
1596   // predecessor blocks. The only thing to watch out for is that we can't put
1597   // a possibly trapping load in the predecessor if it is a critical edge.
1598   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1599     Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1600     Value *InVal = PN.getIncomingValue(Idx);
1601 
1602     // If the value is produced by the terminator of the predecessor (an
1603     // invoke) or it has side-effects, there is no valid place to put a load
1604     // in the predecessor.
1605     if (TI == InVal || TI->mayHaveSideEffects())
1606       return false;
1607 
1608     // If the predecessor has a single successor, then the edge isn't
1609     // critical.
1610     if (TI->getNumSuccessors() == 1)
1611       continue;
1612 
1613     // If this pointer is always safe to load, or if we can prove that there
1614     // is already a load in the block, then we can move the load to the pred
1615     // block.
1616     if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1617       continue;
1618 
1619     return false;
1620   }
1621 
1622   return true;
1623 }
1624 
1625 static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1626   LLVM_DEBUG(dbgs() << "    original: " << PN << "\n");
1627 
1628   LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1629   Type *LoadTy = SomeLoad->getType();
1630   IRB.SetInsertPoint(&PN);
1631   PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1632                                  PN.getName() + ".sroa.speculated");
1633 
1634   // Get the AA tags and alignment to use from one of the loads. It does not
1635   // matter which one we get and if any differ.
1636   AAMDNodes AATags = SomeLoad->getAAMetadata();
1637   Align Alignment = SomeLoad->getAlign();
1638 
1639   // Rewrite all loads of the PN to use the new PHI.
1640   while (!PN.use_empty()) {
1641     LoadInst *LI = cast<LoadInst>(PN.user_back());
1642     LI->replaceAllUsesWith(NewPN);
1643     LI->eraseFromParent();
1644   }
1645 
1646   // Inject loads into all of the pred blocks.
1647   DenseMap<BasicBlock *, Value *> InjectedLoads;
1648   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1649     BasicBlock *Pred = PN.getIncomingBlock(Idx);
1650     Value *InVal = PN.getIncomingValue(Idx);
1651 
1652     // A PHI node is allowed to have multiple (duplicated) entries for the same
1653     // basic block, as long as the value is the same. So if we already injected
1654     // a load in the predecessor, then we should reuse the same load for all
1655     // duplicated entries.
1656     if (Value *V = InjectedLoads.lookup(Pred)) {
1657       NewPN->addIncoming(V, Pred);
1658       continue;
1659     }
1660 
1661     Instruction *TI = Pred->getTerminator();
1662     IRB.SetInsertPoint(TI);
1663 
1664     LoadInst *Load = IRB.CreateAlignedLoad(
1665         LoadTy, InVal, Alignment,
1666         (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1667     ++NumLoadsSpeculated;
1668     if (AATags)
1669       Load->setAAMetadata(AATags);
1670     NewPN->addIncoming(Load, Pred);
1671     InjectedLoads[Pred] = Load;
1672   }
1673 
1674   LLVM_DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
1675   PN.eraseFromParent();
1676 }
1677 
1678 SelectHandSpeculativity &
1679 SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1680   if (isTrueVal)
1681     Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1682   else
1683     Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1684   return *this;
1685 }
1686 
1687 bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1688   return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1689                    : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1690 }
1691 
1692 bool SelectHandSpeculativity::areAllSpeculatable() const {
1693   return isSpeculatable(/*isTrueVal=*/true) &&
1694          isSpeculatable(/*isTrueVal=*/false);
1695 }
1696 
1697 bool SelectHandSpeculativity::areAnySpeculatable() const {
1698   return isSpeculatable(/*isTrueVal=*/true) ||
1699          isSpeculatable(/*isTrueVal=*/false);
1700 }
1701 bool SelectHandSpeculativity::areNoneSpeculatable() const {
1702   return !areAnySpeculatable();
1703 }
1704 
1705 static SelectHandSpeculativity
1706 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1707   assert(LI.isSimple() && "Only for simple loads");
1708   SelectHandSpeculativity Spec;
1709 
1710   const DataLayout &DL = SI.getDataLayout();
1711   for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1712     if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1713                                     &LI))
1714       Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1715     else if (PreserveCFG)
1716       return Spec;
1717 
1718   return Spec;
1719 }
1720 
1721 std::optional<RewriteableMemOps>
1722 SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1723   RewriteableMemOps Ops;
1724 
1725   for (User *U : SI.users()) {
1726     if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1727       U = *BC->user_begin();
1728 
1729     if (auto *Store = dyn_cast<StoreInst>(U)) {
1730       // Note that atomic stores can be transformed; atomic semantics do not
1731       // have any meaning for a local alloca. Stores are not speculatable,
1732       // however, so if we can't turn it into a predicated store, we are done.
1733       if (Store->isVolatile() || PreserveCFG)
1734         return {}; // Give up on this `select`.
1735       Ops.emplace_back(Store);
1736       continue;
1737     }
1738 
1739     auto *LI = dyn_cast<LoadInst>(U);
1740 
1741     // Note that atomic loads can be transformed;
1742     // atomic semantics do not have any meaning for a local alloca.
1743     if (!LI || LI->isVolatile())
1744       return {}; // Give up on this `select`.
1745 
1746     PossiblySpeculatableLoad Load(LI);
1747     if (!LI->isSimple()) {
1748       // If the `load` is not simple, we can't speculatively execute it,
1749       // but we could handle this via a CFG modification. But can we?
1750       if (PreserveCFG)
1751         return {}; // Give up on this `select`.
1752       Ops.emplace_back(Load);
1753       continue;
1754     }
1755 
1756     SelectHandSpeculativity Spec =
1757         isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1758     if (PreserveCFG && !Spec.areAllSpeculatable())
1759       return {}; // Give up on this `select`.
1760 
1761     Load.setInt(Spec);
1762     Ops.emplace_back(Load);
1763   }
1764 
1765   return Ops;
1766 }
1767 
1768 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1769                                      IRBuilderTy &IRB) {
1770   LLVM_DEBUG(dbgs() << "    original load: " << SI << "\n");
1771 
1772   Value *TV = SI.getTrueValue();
1773   Value *FV = SI.getFalseValue();
1774   // Replace the given load of the select with a select of two loads.
1775 
1776   assert(LI.isSimple() && "We only speculate simple loads");
1777 
1778   IRB.SetInsertPoint(&LI);
1779 
1780   LoadInst *TL =
1781       IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1782                             LI.getName() + ".sroa.speculate.load.true");
1783   LoadInst *FL =
1784       IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1785                             LI.getName() + ".sroa.speculate.load.false");
1786   NumLoadsSpeculated += 2;
1787 
1788   // Transfer alignment and AA info if present.
1789   TL->setAlignment(LI.getAlign());
1790   FL->setAlignment(LI.getAlign());
1791 
1792   AAMDNodes Tags = LI.getAAMetadata();
1793   if (Tags) {
1794     TL->setAAMetadata(Tags);
1795     FL->setAAMetadata(Tags);
1796   }
1797 
1798   Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1799                               LI.getName() + ".sroa.speculated");
1800 
1801   LLVM_DEBUG(dbgs() << "          speculated to: " << *V << "\n");
1802   LI.replaceAllUsesWith(V);
1803 }
1804 
1805 template <typename T>
1806 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1807                                  SelectHandSpeculativity Spec,
1808                                  DomTreeUpdater &DTU) {
1809   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1810   LLVM_DEBUG(dbgs() << "    original mem op: " << I << "\n");
1811   BasicBlock *Head = I.getParent();
1812   Instruction *ThenTerm = nullptr;
1813   Instruction *ElseTerm = nullptr;
1814   if (Spec.areNoneSpeculatable())
1815     SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1816                                   SI.getMetadata(LLVMContext::MD_prof), &DTU);
1817   else {
1818     SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1819                               SI.getMetadata(LLVMContext::MD_prof), &DTU,
1820                               /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1821     if (Spec.isSpeculatable(/*isTrueVal=*/true))
1822       cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1823   }
1824   auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1825   Spec = {}; // Do not use `Spec` beyond this point.
1826   BasicBlock *Tail = I.getParent();
1827   Tail->setName(Head->getName() + ".cont");
1828   PHINode *PN;
1829   if (isa<LoadInst>(I))
1830     PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1831   for (BasicBlock *SuccBB : successors(Head)) {
1832     bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1833     int SuccIdx = IsThen ? 0 : 1;
1834     auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1835     auto &CondMemOp = cast<T>(*I.clone());
1836     if (NewMemOpBB != Head) {
1837       NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1838       if (isa<LoadInst>(I))
1839         ++NumLoadsPredicated;
1840       else
1841         ++NumStoresPredicated;
1842     } else {
1843       CondMemOp.dropUBImplyingAttrsAndMetadata();
1844       ++NumLoadsSpeculated;
1845     }
1846     CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1847     Value *Ptr = SI.getOperand(1 + SuccIdx);
1848     CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1849     if (isa<LoadInst>(I)) {
1850       CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1851       PN->addIncoming(&CondMemOp, NewMemOpBB);
1852     } else
1853       LLVM_DEBUG(dbgs() << "                 to: " << CondMemOp << "\n");
1854   }
1855   if (isa<LoadInst>(I)) {
1856     PN->takeName(&I);
1857     LLVM_DEBUG(dbgs() << "          to: " << *PN << "\n");
1858     I.replaceAllUsesWith(PN);
1859   }
1860 }
1861 
1862 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1863                                  SelectHandSpeculativity Spec,
1864                                  DomTreeUpdater &DTU) {
1865   if (auto *LI = dyn_cast<LoadInst>(&I))
1866     rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1867   else if (auto *SI = dyn_cast<StoreInst>(&I))
1868     rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1869   else
1870     llvm_unreachable_internal("Only for load and store.");
1871 }
1872 
1873 static bool rewriteSelectInstMemOps(SelectInst &SI,
1874                                     const RewriteableMemOps &Ops,
1875                                     IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1876   bool CFGChanged = false;
1877   LLVM_DEBUG(dbgs() << "    original select: " << SI << "\n");
1878 
1879   for (const RewriteableMemOp &Op : Ops) {
1880     SelectHandSpeculativity Spec;
1881     Instruction *I;
1882     if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1883       I = *US;
1884     } else {
1885       auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1886       I = PSL.getPointer();
1887       Spec = PSL.getInt();
1888     }
1889     if (Spec.areAllSpeculatable()) {
1890       speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1891     } else {
1892       assert(DTU && "Should not get here when not allowed to modify the CFG!");
1893       rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1894       CFGChanged = true;
1895     }
1896     I->eraseFromParent();
1897   }
1898 
1899   for (User *U : make_early_inc_range(SI.users()))
1900     cast<BitCastInst>(U)->eraseFromParent();
1901   SI.eraseFromParent();
1902   return CFGChanged;
1903 }
1904 
1905 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1906 /// resulting pointer has PointerTy.
1907 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1908                              APInt Offset, Type *PointerTy,
1909                              const Twine &NamePrefix) {
1910   if (Offset != 0)
1911     Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1912                                    NamePrefix + "sroa_idx");
1913   return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1914                                                  NamePrefix + "sroa_cast");
1915 }
1916 
1917 /// Compute the adjusted alignment for a load or store from an offset.
1918 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1919   return commonAlignment(getLoadStoreAlignment(I), Offset);
1920 }
1921 
1922 /// Test whether we can convert a value from the old to the new type.
1923 ///
1924 /// This predicate should be used to guard calls to convertValue in order to
1925 /// ensure that we only try to convert viable values. The strategy is that we
1926 /// will peel off single element struct and array wrappings to get to an
1927 /// underlying value, and convert that value.
1928 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
1929   if (OldTy == NewTy)
1930     return true;
1931 
1932   // For integer types, we can't handle any bit-width differences. This would
1933   // break both vector conversions with extension and introduce endianness
1934   // issues when in conjunction with loads and stores.
1935   if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1936     assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1937                cast<IntegerType>(NewTy)->getBitWidth() &&
1938            "We can't have the same bitwidth for different int types");
1939     return false;
1940   }
1941 
1942   if (DL.getTypeSizeInBits(NewTy).getFixedValue() !=
1943       DL.getTypeSizeInBits(OldTy).getFixedValue())
1944     return false;
1945   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1946     return false;
1947 
1948   // We can convert pointers to integers and vice-versa. Same for vectors
1949   // of pointers and integers.
1950   OldTy = OldTy->getScalarType();
1951   NewTy = NewTy->getScalarType();
1952   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1953     if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1954       unsigned OldAS = OldTy->getPointerAddressSpace();
1955       unsigned NewAS = NewTy->getPointerAddressSpace();
1956       // Convert pointers if they are pointers from the same address space or
1957       // different integral (not non-integral) address spaces with the same
1958       // pointer size.
1959       return OldAS == NewAS ||
1960              (!DL.isNonIntegralAddressSpace(OldAS) &&
1961               !DL.isNonIntegralAddressSpace(NewAS) &&
1962               DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1963     }
1964 
1965     // We can convert integers to integral pointers, but not to non-integral
1966     // pointers.
1967     if (OldTy->isIntegerTy())
1968       return !DL.isNonIntegralPointerType(NewTy);
1969 
1970     // We can convert integral pointers to integers, but non-integral pointers
1971     // need to remain pointers.
1972     if (!DL.isNonIntegralPointerType(OldTy))
1973       return NewTy->isIntegerTy();
1974 
1975     return false;
1976   }
1977 
1978   if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
1979     return false;
1980 
1981   return true;
1982 }
1983 
1984 /// Generic routine to convert an SSA value to a value of a different
1985 /// type.
1986 ///
1987 /// This will try various different casting techniques, such as bitcasts,
1988 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1989 /// two types for viability with this routine.
1990 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
1991                            Type *NewTy) {
1992   Type *OldTy = V->getType();
1993   assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
1994 
1995   if (OldTy == NewTy)
1996     return V;
1997 
1998   assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
1999          "Integer types must be the exact same to convert.");
2000 
2001   // See if we need inttoptr for this type pair. May require additional bitcast.
2002   if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2003     // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2004     // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2005     // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2006     // Directly handle i64 to i8*
2007     return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
2008                               NewTy);
2009   }
2010 
2011   // See if we need ptrtoint for this type pair. May require additional bitcast.
2012   if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2013     // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2014     // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2015     // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2016     // Expand i8* to i64 --> i8* to i64 to i64
2017     return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2018                              NewTy);
2019   }
2020 
2021   if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2022     unsigned OldAS = OldTy->getPointerAddressSpace();
2023     unsigned NewAS = NewTy->getPointerAddressSpace();
2024     // To convert pointers with different address spaces (they are already
2025     // checked convertible, i.e. they have the same pointer size), so far we
2026     // cannot use `bitcast` (which has restrict on the same address space) or
2027     // `addrspacecast` (which is not always no-op casting). Instead, use a pair
2028     // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
2029     // size.
2030     if (OldAS != NewAS) {
2031       assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2032       return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2033                                 NewTy);
2034     }
2035   }
2036 
2037   return IRB.CreateBitCast(V, NewTy);
2038 }
2039 
2040 /// Test whether the given slice use can be promoted to a vector.
2041 ///
2042 /// This function is called to test each entry in a partition which is slated
2043 /// for a single slice.
2044 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2045                                             VectorType *Ty,
2046                                             uint64_t ElementSize,
2047                                             const DataLayout &DL) {
2048   // First validate the slice offsets.
2049   uint64_t BeginOffset =
2050       std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2051   uint64_t BeginIndex = BeginOffset / ElementSize;
2052   if (BeginIndex * ElementSize != BeginOffset ||
2053       BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2054     return false;
2055   uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2056   uint64_t EndIndex = EndOffset / ElementSize;
2057   if (EndIndex * ElementSize != EndOffset ||
2058       EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2059     return false;
2060 
2061   assert(EndIndex > BeginIndex && "Empty vector!");
2062   uint64_t NumElements = EndIndex - BeginIndex;
2063   Type *SliceTy = (NumElements == 1)
2064                       ? Ty->getElementType()
2065                       : FixedVectorType::get(Ty->getElementType(), NumElements);
2066 
2067   Type *SplitIntTy =
2068       Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2069 
2070   Use *U = S.getUse();
2071 
2072   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2073     if (MI->isVolatile())
2074       return false;
2075     if (!S.isSplittable())
2076       return false; // Skip any unsplittable intrinsics.
2077   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2078     if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2079       return false;
2080   } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2081     if (LI->isVolatile())
2082       return false;
2083     Type *LTy = LI->getType();
2084     // Disable vector promotion when there are loads or stores of an FCA.
2085     if (LTy->isStructTy())
2086       return false;
2087     if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2088       assert(LTy->isIntegerTy());
2089       LTy = SplitIntTy;
2090     }
2091     if (!canConvertValue(DL, SliceTy, LTy))
2092       return false;
2093   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2094     if (SI->isVolatile())
2095       return false;
2096     Type *STy = SI->getValueOperand()->getType();
2097     // Disable vector promotion when there are loads or stores of an FCA.
2098     if (STy->isStructTy())
2099       return false;
2100     if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2101       assert(STy->isIntegerTy());
2102       STy = SplitIntTy;
2103     }
2104     if (!canConvertValue(DL, STy, SliceTy))
2105       return false;
2106   } else {
2107     return false;
2108   }
2109 
2110   return true;
2111 }
2112 
2113 /// Test whether a vector type is viable for promotion.
2114 ///
2115 /// This implements the necessary checking for \c checkVectorTypesForPromotion
2116 /// (and thus isVectorPromotionViable) over all slices of the alloca for the
2117 /// given VectorType.
2118 static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2119                                         const DataLayout &DL) {
2120   uint64_t ElementSize =
2121       DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2122 
2123   // While the definition of LLVM vectors is bitpacked, we don't support sizes
2124   // that aren't byte sized.
2125   if (ElementSize % 8)
2126     return false;
2127   assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2128          "vector size not a multiple of element size?");
2129   ElementSize /= 8;
2130 
2131   for (const Slice &S : P)
2132     if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
2133       return false;
2134 
2135   for (const Slice *S : P.splitSliceTails())
2136     if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
2137       return false;
2138 
2139   return true;
2140 }
2141 
2142 /// Test whether any vector type in \p CandidateTys is viable for promotion.
2143 ///
2144 /// This implements the necessary checking for \c isVectorPromotionViable over
2145 /// all slices of the alloca for the given VectorType.
2146 static VectorType *
2147 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2148                              SmallVectorImpl<VectorType *> &CandidateTys,
2149                              bool HaveCommonEltTy, Type *CommonEltTy,
2150                              bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2151                              VectorType *CommonVecPtrTy) {
2152   // If we didn't find a vector type, nothing to do here.
2153   if (CandidateTys.empty())
2154     return nullptr;
2155 
2156   // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2157   // then we should choose it, not some other alternative.
2158   // But, we can't perform a no-op pointer address space change via bitcast,
2159   // so if we didn't have a common pointer element type, bail.
2160   if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2161     return nullptr;
2162 
2163   // Try to pick the "best" element type out of the choices.
2164   if (!HaveCommonEltTy && HaveVecPtrTy) {
2165     // If there was a pointer element type, there's really only one choice.
2166     CandidateTys.clear();
2167     CandidateTys.push_back(CommonVecPtrTy);
2168   } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2169     // Integer-ify vector types.
2170     for (VectorType *&VTy : CandidateTys) {
2171       if (!VTy->getElementType()->isIntegerTy())
2172         VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2173             VTy->getContext(), VTy->getScalarSizeInBits())));
2174     }
2175 
2176     // Rank the remaining candidate vector types. This is easy because we know
2177     // they're all integer vectors. We sort by ascending number of elements.
2178     auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2179       (void)DL;
2180       assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2181                  DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2182              "Cannot have vector types of different sizes!");
2183       assert(RHSTy->getElementType()->isIntegerTy() &&
2184              "All non-integer types eliminated!");
2185       assert(LHSTy->getElementType()->isIntegerTy() &&
2186              "All non-integer types eliminated!");
2187       return cast<FixedVectorType>(RHSTy)->getNumElements() <
2188              cast<FixedVectorType>(LHSTy)->getNumElements();
2189     };
2190     auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2191       (void)DL;
2192       assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2193                  DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2194              "Cannot have vector types of different sizes!");
2195       assert(RHSTy->getElementType()->isIntegerTy() &&
2196              "All non-integer types eliminated!");
2197       assert(LHSTy->getElementType()->isIntegerTy() &&
2198              "All non-integer types eliminated!");
2199       return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2200              cast<FixedVectorType>(LHSTy)->getNumElements();
2201     };
2202     llvm::sort(CandidateTys, RankVectorTypesComp);
2203     CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2204                        CandidateTys.end());
2205   } else {
2206 // The only way to have the same element type in every vector type is to
2207 // have the same vector type. Check that and remove all but one.
2208 #ifndef NDEBUG
2209     for (VectorType *VTy : CandidateTys) {
2210       assert(VTy->getElementType() == CommonEltTy &&
2211              "Unaccounted for element type!");
2212       assert(VTy == CandidateTys[0] &&
2213              "Different vector types with the same element type!");
2214     }
2215 #endif
2216     CandidateTys.resize(1);
2217   }
2218 
2219   // FIXME: hack. Do we have a named constant for this?
2220   // SDAG SDNode can't have more than 65535 operands.
2221   llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2222     return cast<FixedVectorType>(VTy)->getNumElements() >
2223            std::numeric_limits<unsigned short>::max();
2224   });
2225 
2226   for (VectorType *VTy : CandidateTys)
2227     if (checkVectorTypeForPromotion(P, VTy, DL))
2228       return VTy;
2229 
2230   return nullptr;
2231 }
2232 
2233 static VectorType *createAndCheckVectorTypesForPromotion(
2234     SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2235     function_ref<void(Type *)> CheckCandidateType, Partition &P,
2236     const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2237     bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2238     bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) {
2239   [[maybe_unused]] VectorType *OriginalElt =
2240       CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2241   // Consider additional vector types where the element type size is a
2242   // multiple of load/store element size.
2243   for (Type *Ty : OtherTys) {
2244     if (!VectorType::isValidElementType(Ty))
2245       continue;
2246     unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2247     // Make a copy of CandidateTys and iterate through it, because we
2248     // might append to CandidateTys in the loop.
2249     for (VectorType *const VTy : CandidateTysCopy) {
2250       // The elements in the copy should remain invariant throughout the loop
2251       assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2252       unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2253       unsigned ElementSize =
2254           DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2255       if (TypeSize != VectorSize && TypeSize != ElementSize &&
2256           VectorSize % TypeSize == 0) {
2257         VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2258         CheckCandidateType(NewVTy);
2259       }
2260     }
2261   }
2262 
2263   return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy,
2264                                       CommonEltTy, HaveVecPtrTy,
2265                                       HaveCommonVecPtrTy, CommonVecPtrTy);
2266 }
2267 
2268 /// Test whether the given alloca partitioning and range of slices can be
2269 /// promoted to a vector.
2270 ///
2271 /// This is a quick test to check whether we can rewrite a particular alloca
2272 /// partition (and its newly formed alloca) into a vector alloca with only
2273 /// whole-vector loads and stores such that it could be promoted to a vector
2274 /// SSA value. We only can ensure this for a limited set of operations, and we
2275 /// don't want to do the rewrites unless we are confident that the result will
2276 /// be promotable, so we have an early test here.
2277 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
2278   // Collect the candidate types for vector-based promotion. Also track whether
2279   // we have different element types.
2280   SmallVector<VectorType *, 4> CandidateTys;
2281   SetVector<Type *> LoadStoreTys;
2282   SetVector<Type *> DeferredTys;
2283   Type *CommonEltTy = nullptr;
2284   VectorType *CommonVecPtrTy = nullptr;
2285   bool HaveVecPtrTy = false;
2286   bool HaveCommonEltTy = true;
2287   bool HaveCommonVecPtrTy = true;
2288   auto CheckCandidateType = [&](Type *Ty) {
2289     if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2290       // Return if bitcast to vectors is different for total size in bits.
2291       if (!CandidateTys.empty()) {
2292         VectorType *V = CandidateTys[0];
2293         if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2294             DL.getTypeSizeInBits(V).getFixedValue()) {
2295           CandidateTys.clear();
2296           return;
2297         }
2298       }
2299       CandidateTys.push_back(VTy);
2300       Type *EltTy = VTy->getElementType();
2301 
2302       if (!CommonEltTy)
2303         CommonEltTy = EltTy;
2304       else if (CommonEltTy != EltTy)
2305         HaveCommonEltTy = false;
2306 
2307       if (EltTy->isPointerTy()) {
2308         HaveVecPtrTy = true;
2309         if (!CommonVecPtrTy)
2310           CommonVecPtrTy = VTy;
2311         else if (CommonVecPtrTy != VTy)
2312           HaveCommonVecPtrTy = false;
2313       }
2314     }
2315   };
2316 
2317   // Put load and store types into a set for de-duplication.
2318   for (const Slice &S : P) {
2319     Type *Ty;
2320     if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2321       Ty = LI->getType();
2322     else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2323       Ty = SI->getValueOperand()->getType();
2324     else
2325       continue;
2326 
2327     auto CandTy = Ty->getScalarType();
2328     if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2329                                   S.endOffset() != P.endOffset())) {
2330       DeferredTys.insert(Ty);
2331       continue;
2332     }
2333 
2334     LoadStoreTys.insert(Ty);
2335     // Consider any loads or stores that are the exact size of the slice.
2336     if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2337       CheckCandidateType(Ty);
2338   }
2339 
2340   SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2341   if (auto *VTy = createAndCheckVectorTypesForPromotion(
2342           LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2343           CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2344           HaveCommonVecPtrTy, CommonVecPtrTy))
2345     return VTy;
2346 
2347   CandidateTys.clear();
2348   return createAndCheckVectorTypesForPromotion(
2349       DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2350       HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2351       CommonVecPtrTy);
2352 }
2353 
2354 /// Test whether a slice of an alloca is valid for integer widening.
2355 ///
2356 /// This implements the necessary checking for the \c isIntegerWideningViable
2357 /// test below on a single slice of the alloca.
2358 static bool isIntegerWideningViableForSlice(const Slice &S,
2359                                             uint64_t AllocBeginOffset,
2360                                             Type *AllocaTy,
2361                                             const DataLayout &DL,
2362                                             bool &WholeAllocaOp) {
2363   uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2364 
2365   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2366   uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2367 
2368   Use *U = S.getUse();
2369 
2370   // Lifetime intrinsics operate over the whole alloca whose sizes are usually
2371   // larger than other load/store slices (RelEnd > Size). But lifetime are
2372   // always promotable and should not impact other slices' promotability of the
2373   // partition.
2374   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2375     if (II->isLifetimeStartOrEnd() || II->isDroppable())
2376       return true;
2377   }
2378 
2379   // We can't reasonably handle cases where the load or store extends past
2380   // the end of the alloca's type and into its padding.
2381   if (RelEnd > Size)
2382     return false;
2383 
2384   if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2385     if (LI->isVolatile())
2386       return false;
2387     // We can't handle loads that extend past the allocated memory.
2388     if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size)
2389       return false;
2390     // So far, AllocaSliceRewriter does not support widening split slice tails
2391     // in rewriteIntegerLoad.
2392     if (S.beginOffset() < AllocBeginOffset)
2393       return false;
2394     // Note that we don't count vector loads or stores as whole-alloca
2395     // operations which enable integer widening because we would prefer to use
2396     // vector widening instead.
2397     if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2398       WholeAllocaOp = true;
2399     if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2400       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2401         return false;
2402     } else if (RelBegin != 0 || RelEnd != Size ||
2403                !canConvertValue(DL, AllocaTy, LI->getType())) {
2404       // Non-integer loads need to be convertible from the alloca type so that
2405       // they are promotable.
2406       return false;
2407     }
2408   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2409     Type *ValueTy = SI->getValueOperand()->getType();
2410     if (SI->isVolatile())
2411       return false;
2412     // We can't handle stores that extend past the allocated memory.
2413     if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size)
2414       return false;
2415     // So far, AllocaSliceRewriter does not support widening split slice tails
2416     // in rewriteIntegerStore.
2417     if (S.beginOffset() < AllocBeginOffset)
2418       return false;
2419     // Note that we don't count vector loads or stores as whole-alloca
2420     // operations which enable integer widening because we would prefer to use
2421     // vector widening instead.
2422     if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2423       WholeAllocaOp = true;
2424     if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2425       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2426         return false;
2427     } else if (RelBegin != 0 || RelEnd != Size ||
2428                !canConvertValue(DL, ValueTy, AllocaTy)) {
2429       // Non-integer stores need to be convertible to the alloca type so that
2430       // they are promotable.
2431       return false;
2432     }
2433   } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2434     if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2435       return false;
2436     if (!S.isSplittable())
2437       return false; // Skip any unsplittable intrinsics.
2438   } else {
2439     return false;
2440   }
2441 
2442   return true;
2443 }
2444 
2445 /// Test whether the given alloca partition's integer operations can be
2446 /// widened to promotable ones.
2447 ///
2448 /// This is a quick test to check whether we can rewrite the integer loads and
2449 /// stores to a particular alloca into wider loads and stores and be able to
2450 /// promote the resulting alloca.
2451 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2452                                     const DataLayout &DL) {
2453   uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2454   // Don't create integer types larger than the maximum bitwidth.
2455   if (SizeInBits > IntegerType::MAX_INT_BITS)
2456     return false;
2457 
2458   // Don't try to handle allocas with bit-padding.
2459   if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2460     return false;
2461 
2462   // We need to ensure that an integer type with the appropriate bitwidth can
2463   // be converted to the alloca type, whatever that is. We don't want to force
2464   // the alloca itself to have an integer type if there is a more suitable one.
2465   Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2466   if (!canConvertValue(DL, AllocaTy, IntTy) ||
2467       !canConvertValue(DL, IntTy, AllocaTy))
2468     return false;
2469 
2470   // While examining uses, we ensure that the alloca has a covering load or
2471   // store. We don't want to widen the integer operations only to fail to
2472   // promote due to some other unsplittable entry (which we may make splittable
2473   // later). However, if there are only splittable uses, go ahead and assume
2474   // that we cover the alloca.
2475   // FIXME: We shouldn't consider split slices that happen to start in the
2476   // partition here...
2477   bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2478 
2479   for (const Slice &S : P)
2480     if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2481                                          WholeAllocaOp))
2482       return false;
2483 
2484   for (const Slice *S : P.splitSliceTails())
2485     if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2486                                          WholeAllocaOp))
2487       return false;
2488 
2489   return WholeAllocaOp;
2490 }
2491 
2492 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2493                              IntegerType *Ty, uint64_t Offset,
2494                              const Twine &Name) {
2495   LLVM_DEBUG(dbgs() << "       start: " << *V << "\n");
2496   IntegerType *IntTy = cast<IntegerType>(V->getType());
2497   assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2498              DL.getTypeStoreSize(IntTy).getFixedValue() &&
2499          "Element extends past full value");
2500   uint64_t ShAmt = 8 * Offset;
2501   if (DL.isBigEndian())
2502     ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2503                  DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2504   if (ShAmt) {
2505     V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2506     LLVM_DEBUG(dbgs() << "     shifted: " << *V << "\n");
2507   }
2508   assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2509          "Cannot extract to a larger integer!");
2510   if (Ty != IntTy) {
2511     V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2512     LLVM_DEBUG(dbgs() << "     trunced: " << *V << "\n");
2513   }
2514   return V;
2515 }
2516 
2517 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2518                             Value *V, uint64_t Offset, const Twine &Name) {
2519   IntegerType *IntTy = cast<IntegerType>(Old->getType());
2520   IntegerType *Ty = cast<IntegerType>(V->getType());
2521   assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2522          "Cannot insert a larger integer!");
2523   LLVM_DEBUG(dbgs() << "       start: " << *V << "\n");
2524   if (Ty != IntTy) {
2525     V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2526     LLVM_DEBUG(dbgs() << "    extended: " << *V << "\n");
2527   }
2528   assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2529              DL.getTypeStoreSize(IntTy).getFixedValue() &&
2530          "Element store outside of alloca store");
2531   uint64_t ShAmt = 8 * Offset;
2532   if (DL.isBigEndian())
2533     ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2534                  DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2535   if (ShAmt) {
2536     V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2537     LLVM_DEBUG(dbgs() << "     shifted: " << *V << "\n");
2538   }
2539 
2540   if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2541     APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2542     Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2543     LLVM_DEBUG(dbgs() << "      masked: " << *Old << "\n");
2544     V = IRB.CreateOr(Old, V, Name + ".insert");
2545     LLVM_DEBUG(dbgs() << "    inserted: " << *V << "\n");
2546   }
2547   return V;
2548 }
2549 
2550 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2551                             unsigned EndIndex, const Twine &Name) {
2552   auto *VecTy = cast<FixedVectorType>(V->getType());
2553   unsigned NumElements = EndIndex - BeginIndex;
2554   assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2555 
2556   if (NumElements == VecTy->getNumElements())
2557     return V;
2558 
2559   if (NumElements == 1) {
2560     V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2561                                  Name + ".extract");
2562     LLVM_DEBUG(dbgs() << "     extract: " << *V << "\n");
2563     return V;
2564   }
2565 
2566   auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2567   V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2568   LLVM_DEBUG(dbgs() << "     shuffle: " << *V << "\n");
2569   return V;
2570 }
2571 
2572 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2573                            unsigned BeginIndex, const Twine &Name) {
2574   VectorType *VecTy = cast<VectorType>(Old->getType());
2575   assert(VecTy && "Can only insert a vector into a vector");
2576 
2577   VectorType *Ty = dyn_cast<VectorType>(V->getType());
2578   if (!Ty) {
2579     // Single element to insert.
2580     V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2581                                 Name + ".insert");
2582     LLVM_DEBUG(dbgs() << "     insert: " << *V << "\n");
2583     return V;
2584   }
2585 
2586   assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2587              cast<FixedVectorType>(VecTy)->getNumElements() &&
2588          "Too many elements!");
2589   if (cast<FixedVectorType>(Ty)->getNumElements() ==
2590       cast<FixedVectorType>(VecTy)->getNumElements()) {
2591     assert(V->getType() == VecTy && "Vector type mismatch");
2592     return V;
2593   }
2594   unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2595 
2596   // When inserting a smaller vector into the larger to store, we first
2597   // use a shuffle vector to widen it with undef elements, and then
2598   // a second shuffle vector to select between the loaded vector and the
2599   // incoming vector.
2600   SmallVector<int, 8> Mask;
2601   Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2602   for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2603     if (i >= BeginIndex && i < EndIndex)
2604       Mask.push_back(i - BeginIndex);
2605     else
2606       Mask.push_back(-1);
2607   V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2608   LLVM_DEBUG(dbgs() << "    shuffle: " << *V << "\n");
2609 
2610   SmallVector<Constant *, 8> Mask2;
2611   Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2612   for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2613     Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2614 
2615   V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2616 
2617   LLVM_DEBUG(dbgs() << "    blend: " << *V << "\n");
2618   return V;
2619 }
2620 
2621 namespace {
2622 
2623 /// Visitor to rewrite instructions using p particular slice of an alloca
2624 /// to use a new alloca.
2625 ///
2626 /// Also implements the rewriting to vector-based accesses when the partition
2627 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2628 /// lives here.
2629 class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2630   // Befriend the base class so it can delegate to private visit methods.
2631   friend class InstVisitor<AllocaSliceRewriter, bool>;
2632 
2633   using Base = InstVisitor<AllocaSliceRewriter, bool>;
2634 
2635   const DataLayout &DL;
2636   AllocaSlices &AS;
2637   SROA &Pass;
2638   AllocaInst &OldAI, &NewAI;
2639   const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2640   Type *NewAllocaTy;
2641 
2642   // This is a convenience and flag variable that will be null unless the new
2643   // alloca's integer operations should be widened to this integer type due to
2644   // passing isIntegerWideningViable above. If it is non-null, the desired
2645   // integer type will be stored here for easy access during rewriting.
2646   IntegerType *IntTy;
2647 
2648   // If we are rewriting an alloca partition which can be written as pure
2649   // vector operations, we stash extra information here. When VecTy is
2650   // non-null, we have some strict guarantees about the rewritten alloca:
2651   //   - The new alloca is exactly the size of the vector type here.
2652   //   - The accesses all either map to the entire vector or to a single
2653   //     element.
2654   //   - The set of accessing instructions is only one of those handled above
2655   //     in isVectorPromotionViable. Generally these are the same access kinds
2656   //     which are promotable via mem2reg.
2657   VectorType *VecTy;
2658   Type *ElementTy;
2659   uint64_t ElementSize;
2660 
2661   // The original offset of the slice currently being rewritten relative to
2662   // the original alloca.
2663   uint64_t BeginOffset = 0;
2664   uint64_t EndOffset = 0;
2665 
2666   // The new offsets of the slice currently being rewritten relative to the
2667   // original alloca.
2668   uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2669 
2670   uint64_t SliceSize = 0;
2671   bool IsSplittable = false;
2672   bool IsSplit = false;
2673   Use *OldUse = nullptr;
2674   Instruction *OldPtr = nullptr;
2675 
2676   // Track post-rewrite users which are PHI nodes and Selects.
2677   SmallSetVector<PHINode *, 8> &PHIUsers;
2678   SmallSetVector<SelectInst *, 8> &SelectUsers;
2679 
2680   // Utility IR builder, whose name prefix is setup for each visited use, and
2681   // the insertion point is set to point to the user.
2682   IRBuilderTy IRB;
2683 
2684   // Return the new alloca, addrspacecasted if required to avoid changing the
2685   // addrspace of a volatile access.
2686   Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2687     if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2688       return &NewAI;
2689 
2690     Type *AccessTy = IRB.getPtrTy(AddrSpace);
2691     return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2692   }
2693 
2694 public:
2695   AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2696                       AllocaInst &OldAI, AllocaInst &NewAI,
2697                       uint64_t NewAllocaBeginOffset,
2698                       uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2699                       VectorType *PromotableVecTy,
2700                       SmallSetVector<PHINode *, 8> &PHIUsers,
2701                       SmallSetVector<SelectInst *, 8> &SelectUsers)
2702       : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2703         NewAllocaBeginOffset(NewAllocaBeginOffset),
2704         NewAllocaEndOffset(NewAllocaEndOffset),
2705         NewAllocaTy(NewAI.getAllocatedType()),
2706         IntTy(
2707             IsIntegerPromotable
2708                 ? Type::getIntNTy(NewAI.getContext(),
2709                                   DL.getTypeSizeInBits(NewAI.getAllocatedType())
2710                                       .getFixedValue())
2711                 : nullptr),
2712         VecTy(PromotableVecTy),
2713         ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2714         ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2715                           : 0),
2716         PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2717         IRB(NewAI.getContext(), ConstantFolder()) {
2718     if (VecTy) {
2719       assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2720              "Only multiple-of-8 sized vector elements are viable");
2721       ++NumVectorized;
2722     }
2723     assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2724   }
2725 
2726   bool visit(AllocaSlices::const_iterator I) {
2727     bool CanSROA = true;
2728     BeginOffset = I->beginOffset();
2729     EndOffset = I->endOffset();
2730     IsSplittable = I->isSplittable();
2731     IsSplit =
2732         BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2733     LLVM_DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
2734     LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2735     LLVM_DEBUG(dbgs() << "\n");
2736 
2737     // Compute the intersecting offset range.
2738     assert(BeginOffset < NewAllocaEndOffset);
2739     assert(EndOffset > NewAllocaBeginOffset);
2740     NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2741     NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2742 
2743     SliceSize = NewEndOffset - NewBeginOffset;
2744     LLVM_DEBUG(dbgs() << "   Begin:(" << BeginOffset << ", " << EndOffset
2745                       << ") NewBegin:(" << NewBeginOffset << ", "
2746                       << NewEndOffset << ") NewAllocaBegin:("
2747                       << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2748                       << ")\n");
2749     assert(IsSplit || NewBeginOffset == BeginOffset);
2750     OldUse = I->getUse();
2751     OldPtr = cast<Instruction>(OldUse->get());
2752 
2753     Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2754     IRB.SetInsertPoint(OldUserI);
2755     IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2756     IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2757                                     Twine(BeginOffset) + ".");
2758 
2759     CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2760     if (VecTy || IntTy)
2761       assert(CanSROA);
2762     return CanSROA;
2763   }
2764 
2765 private:
2766   // Make sure the other visit overloads are visible.
2767   using Base::visit;
2768 
2769   // Every instruction which can end up as a user must have a rewrite rule.
2770   bool visitInstruction(Instruction &I) {
2771     LLVM_DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
2772     llvm_unreachable("No rewrite rule for this instruction!");
2773   }
2774 
2775   Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2776     // Note that the offset computation can use BeginOffset or NewBeginOffset
2777     // interchangeably for unsplit slices.
2778     assert(IsSplit || BeginOffset == NewBeginOffset);
2779     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2780 
2781 #ifndef NDEBUG
2782     StringRef OldName = OldPtr->getName();
2783     // Skip through the last '.sroa.' component of the name.
2784     size_t LastSROAPrefix = OldName.rfind(".sroa.");
2785     if (LastSROAPrefix != StringRef::npos) {
2786       OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2787       // Look for an SROA slice index.
2788       size_t IndexEnd = OldName.find_first_not_of("0123456789");
2789       if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2790         // Strip the index and look for the offset.
2791         OldName = OldName.substr(IndexEnd + 1);
2792         size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2793         if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2794           // Strip the offset.
2795           OldName = OldName.substr(OffsetEnd + 1);
2796       }
2797     }
2798     // Strip any SROA suffixes as well.
2799     OldName = OldName.substr(0, OldName.find(".sroa_"));
2800 #endif
2801 
2802     return getAdjustedPtr(IRB, DL, &NewAI,
2803                           APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2804                           PointerTy,
2805 #ifndef NDEBUG
2806                           Twine(OldName) + "."
2807 #else
2808                           Twine()
2809 #endif
2810     );
2811   }
2812 
2813   /// Compute suitable alignment to access this slice of the *new*
2814   /// alloca.
2815   ///
2816   /// You can optionally pass a type to this routine and if that type's ABI
2817   /// alignment is itself suitable, this will return zero.
2818   Align getSliceAlign() {
2819     return commonAlignment(NewAI.getAlign(),
2820                            NewBeginOffset - NewAllocaBeginOffset);
2821   }
2822 
2823   unsigned getIndex(uint64_t Offset) {
2824     assert(VecTy && "Can only call getIndex when rewriting a vector");
2825     uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2826     assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2827     uint32_t Index = RelOffset / ElementSize;
2828     assert(Index * ElementSize == RelOffset);
2829     return Index;
2830   }
2831 
2832   void deleteIfTriviallyDead(Value *V) {
2833     Instruction *I = cast<Instruction>(V);
2834     if (isInstructionTriviallyDead(I))
2835       Pass.DeadInsts.push_back(I);
2836   }
2837 
2838   Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2839     unsigned BeginIndex = getIndex(NewBeginOffset);
2840     unsigned EndIndex = getIndex(NewEndOffset);
2841     assert(EndIndex > BeginIndex && "Empty vector!");
2842 
2843     LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2844                                            NewAI.getAlign(), "load");
2845 
2846     Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2847                             LLVMContext::MD_access_group});
2848     return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
2849   }
2850 
2851   Value *rewriteIntegerLoad(LoadInst &LI) {
2852     assert(IntTy && "We cannot insert an integer to the alloca");
2853     assert(!LI.isVolatile());
2854     Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2855                                      NewAI.getAlign(), "load");
2856     V = convertValue(DL, IRB, V, IntTy);
2857     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2858     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2859     if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2860       IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
2861       V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
2862     }
2863     // It is possible that the extracted type is not the load type. This
2864     // happens if there is a load past the end of the alloca, and as
2865     // a consequence the slice is narrower but still a candidate for integer
2866     // lowering. To handle this case, we just zero extend the extracted
2867     // integer.
2868     assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2869            "Can only handle an extract for an overly wide load");
2870     if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
2871       V = IRB.CreateZExt(V, LI.getType());
2872     return V;
2873   }
2874 
2875   bool visitLoadInst(LoadInst &LI) {
2876     LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
2877     Value *OldOp = LI.getOperand(0);
2878     assert(OldOp == OldPtr);
2879 
2880     AAMDNodes AATags = LI.getAAMetadata();
2881 
2882     unsigned AS = LI.getPointerAddressSpace();
2883 
2884     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
2885                              : LI.getType();
2886     const bool IsLoadPastEnd =
2887         DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize;
2888     bool IsPtrAdjusted = false;
2889     Value *V;
2890     if (VecTy) {
2891       V = rewriteVectorizedLoadInst(LI);
2892     } else if (IntTy && LI.getType()->isIntegerTy()) {
2893       V = rewriteIntegerLoad(LI);
2894     } else if (NewBeginOffset == NewAllocaBeginOffset &&
2895                NewEndOffset == NewAllocaEndOffset &&
2896                (canConvertValue(DL, NewAllocaTy, TargetTy) ||
2897                 (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
2898                  TargetTy->isIntegerTy() && !LI.isVolatile()))) {
2899       Value *NewPtr =
2900           getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
2901       LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
2902                                               NewAI.getAlign(), LI.isVolatile(),
2903                                               LI.getName());
2904       if (LI.isVolatile())
2905         NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2906       if (NewLI->isAtomic())
2907         NewLI->setAlignment(LI.getAlign());
2908 
2909       // Copy any metadata that is valid for the new load. This may require
2910       // conversion to a different kind of metadata, e.g. !nonnull might change
2911       // to !range or vice versa.
2912       copyMetadataForLoad(*NewLI, LI);
2913 
2914       // Do this after copyMetadataForLoad() to preserve the TBAA shift.
2915       if (AATags)
2916         NewLI->setAAMetadata(AATags.adjustForAccess(
2917             NewBeginOffset - BeginOffset, NewLI->getType(), DL));
2918 
2919       // Try to preserve nonnull metadata
2920       V = NewLI;
2921 
2922       // If this is an integer load past the end of the slice (which means the
2923       // bytes outside the slice are undef or this load is dead) just forcibly
2924       // fix the integer size with correct handling of endianness.
2925       if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2926         if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
2927           if (AITy->getBitWidth() < TITy->getBitWidth()) {
2928             V = IRB.CreateZExt(V, TITy, "load.ext");
2929             if (DL.isBigEndian())
2930               V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
2931                                 "endian_shift");
2932           }
2933     } else {
2934       Type *LTy = IRB.getPtrTy(AS);
2935       LoadInst *NewLI =
2936           IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
2937                                 getSliceAlign(), LI.isVolatile(), LI.getName());
2938 
2939       if (AATags)
2940         NewLI->setAAMetadata(AATags.adjustForAccess(
2941             NewBeginOffset - BeginOffset, NewLI->getType(), DL));
2942 
2943       if (LI.isVolatile())
2944         NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2945       NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2946                                LLVMContext::MD_access_group});
2947 
2948       V = NewLI;
2949       IsPtrAdjusted = true;
2950     }
2951     V = convertValue(DL, IRB, V, TargetTy);
2952 
2953     if (IsSplit) {
2954       assert(!LI.isVolatile());
2955       assert(LI.getType()->isIntegerTy() &&
2956              "Only integer type loads and stores are split");
2957       assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
2958              "Split load isn't smaller than original load");
2959       assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
2960              "Non-byte-multiple bit width");
2961       // Move the insertion point just past the load so that we can refer to it.
2962       BasicBlock::iterator LIIt = std::next(LI.getIterator());
2963       // Ensure the insertion point comes before any debug-info immediately
2964       // after the load, so that variable values referring to the load are
2965       // dominated by it.
2966       LIIt.setHeadBit(true);
2967       IRB.SetInsertPoint(LI.getParent(), LIIt);
2968       // Create a placeholder value with the same type as LI to use as the
2969       // basis for the new value. This allows us to replace the uses of LI with
2970       // the computed value, and then replace the placeholder with LI, leaving
2971       // LI only used for this computation.
2972       Value *Placeholder =
2973           new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
2974                        false, Align(1));
2975       V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
2976                         "insert");
2977       LI.replaceAllUsesWith(V);
2978       Placeholder->replaceAllUsesWith(&LI);
2979       Placeholder->deleteValue();
2980     } else {
2981       LI.replaceAllUsesWith(V);
2982     }
2983 
2984     Pass.DeadInsts.push_back(&LI);
2985     deleteIfTriviallyDead(OldOp);
2986     LLVM_DEBUG(dbgs() << "          to: " << *V << "\n");
2987     return !LI.isVolatile() && !IsPtrAdjusted;
2988   }
2989 
2990   bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
2991                                   AAMDNodes AATags) {
2992     // Capture V for the purpose of debug-info accounting once it's converted
2993     // to a vector store.
2994     Value *OrigV = V;
2995     if (V->getType() != VecTy) {
2996       unsigned BeginIndex = getIndex(NewBeginOffset);
2997       unsigned EndIndex = getIndex(NewEndOffset);
2998       assert(EndIndex > BeginIndex && "Empty vector!");
2999       unsigned NumElements = EndIndex - BeginIndex;
3000       assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3001              "Too many elements!");
3002       Type *SliceTy = (NumElements == 1)
3003                           ? ElementTy
3004                           : FixedVectorType::get(ElementTy, NumElements);
3005       if (V->getType() != SliceTy)
3006         V = convertValue(DL, IRB, V, SliceTy);
3007 
3008       // Mix in the existing elements.
3009       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3010                                          NewAI.getAlign(), "load");
3011       V = insertVector(IRB, Old, V, BeginIndex, "vec");
3012     }
3013     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3014     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3015                              LLVMContext::MD_access_group});
3016     if (AATags)
3017       Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3018                                                   V->getType(), DL));
3019     Pass.DeadInsts.push_back(&SI);
3020 
3021     // NOTE: Careful to use OrigV rather than V.
3022     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3023                      Store, Store->getPointerOperand(), OrigV, DL);
3024     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
3025     return true;
3026   }
3027 
3028   bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3029     assert(IntTy && "We cannot extract an integer from the alloca");
3030     assert(!SI.isVolatile());
3031     if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3032         IntTy->getBitWidth()) {
3033       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3034                                          NewAI.getAlign(), "oldload");
3035       Old = convertValue(DL, IRB, Old, IntTy);
3036       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3037       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3038       V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3039     }
3040     V = convertValue(DL, IRB, V, NewAllocaTy);
3041     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3042     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3043                              LLVMContext::MD_access_group});
3044     if (AATags)
3045       Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3046                                                   V->getType(), DL));
3047 
3048     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3049                      Store, Store->getPointerOperand(),
3050                      Store->getValueOperand(), DL);
3051 
3052     Pass.DeadInsts.push_back(&SI);
3053     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
3054     return true;
3055   }
3056 
3057   bool visitStoreInst(StoreInst &SI) {
3058     LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
3059     Value *OldOp = SI.getOperand(1);
3060     assert(OldOp == OldPtr);
3061 
3062     AAMDNodes AATags = SI.getAAMetadata();
3063     Value *V = SI.getValueOperand();
3064 
3065     // Strip all inbounds GEPs and pointer casts to try to dig out any root
3066     // alloca that should be re-examined after promoting this alloca.
3067     if (V->getType()->isPointerTy())
3068       if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3069         Pass.PostPromotionWorklist.insert(AI);
3070 
3071     if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) {
3072       assert(!SI.isVolatile());
3073       assert(V->getType()->isIntegerTy() &&
3074              "Only integer type loads and stores are split");
3075       assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3076              "Non-byte-multiple bit width");
3077       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3078       V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3079                          "extract");
3080     }
3081 
3082     if (VecTy)
3083       return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3084     if (IntTy && V->getType()->isIntegerTy())
3085       return rewriteIntegerStore(V, SI, AATags);
3086 
3087     StoreInst *NewSI;
3088     if (NewBeginOffset == NewAllocaBeginOffset &&
3089         NewEndOffset == NewAllocaEndOffset &&
3090         canConvertValue(DL, V->getType(), NewAllocaTy)) {
3091       V = convertValue(DL, IRB, V, NewAllocaTy);
3092       Value *NewPtr =
3093           getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3094 
3095       NewSI =
3096           IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3097     } else {
3098       unsigned AS = SI.getPointerAddressSpace();
3099       Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3100       NewSI =
3101           IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3102     }
3103     NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3104                              LLVMContext::MD_access_group});
3105     if (AATags)
3106       NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3107                                                   V->getType(), DL));
3108     if (SI.isVolatile())
3109       NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3110     if (NewSI->isAtomic())
3111       NewSI->setAlignment(SI.getAlign());
3112 
3113     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3114                      NewSI, NewSI->getPointerOperand(),
3115                      NewSI->getValueOperand(), DL);
3116 
3117     Pass.DeadInsts.push_back(&SI);
3118     deleteIfTriviallyDead(OldOp);
3119 
3120     LLVM_DEBUG(dbgs() << "          to: " << *NewSI << "\n");
3121     return NewSI->getPointerOperand() == &NewAI &&
3122            NewSI->getValueOperand()->getType() == NewAllocaTy &&
3123            !SI.isVolatile();
3124   }
3125 
3126   /// Compute an integer value from splatting an i8 across the given
3127   /// number of bytes.
3128   ///
3129   /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3130   /// call this routine.
3131   /// FIXME: Heed the advice above.
3132   ///
3133   /// \param V The i8 value to splat.
3134   /// \param Size The number of bytes in the output (assuming i8 is one byte)
3135   Value *getIntegerSplat(Value *V, unsigned Size) {
3136     assert(Size > 0 && "Expected a positive number of bytes.");
3137     IntegerType *VTy = cast<IntegerType>(V->getType());
3138     assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3139     if (Size == 1)
3140       return V;
3141 
3142     Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3143     V = IRB.CreateMul(
3144         IRB.CreateZExt(V, SplatIntTy, "zext"),
3145         IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3146                        IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3147                                       SplatIntTy)),
3148         "isplat");
3149     return V;
3150   }
3151 
3152   /// Compute a vector splat for a given element value.
3153   Value *getVectorSplat(Value *V, unsigned NumElements) {
3154     V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3155     LLVM_DEBUG(dbgs() << "       splat: " << *V << "\n");
3156     return V;
3157   }
3158 
3159   bool visitMemSetInst(MemSetInst &II) {
3160     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
3161     assert(II.getRawDest() == OldPtr);
3162 
3163     AAMDNodes AATags = II.getAAMetadata();
3164 
3165     // If the memset has a variable size, it cannot be split, just adjust the
3166     // pointer to the new alloca.
3167     if (!isa<ConstantInt>(II.getLength())) {
3168       assert(!IsSplit);
3169       assert(NewBeginOffset == BeginOffset);
3170       II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3171       II.setDestAlignment(getSliceAlign());
3172       // In theory we should call migrateDebugInfo here. However, we do not
3173       // emit dbg.assign intrinsics for mem intrinsics storing through non-
3174       // constant geps, or storing a variable number of bytes.
3175       assert(at::getAssignmentMarkers(&II).empty() &&
3176              at::getDVRAssignmentMarkers(&II).empty() &&
3177              "AT: Unexpected link to non-const GEP");
3178       deleteIfTriviallyDead(OldPtr);
3179       return false;
3180     }
3181 
3182     // Record this instruction for deletion.
3183     Pass.DeadInsts.push_back(&II);
3184 
3185     Type *AllocaTy = NewAI.getAllocatedType();
3186     Type *ScalarTy = AllocaTy->getScalarType();
3187 
3188     const bool CanContinue = [&]() {
3189       if (VecTy || IntTy)
3190         return true;
3191       if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3192         return false;
3193       // Length must be in range for FixedVectorType.
3194       auto *C = cast<ConstantInt>(II.getLength());
3195       const uint64_t Len = C->getLimitedValue();
3196       if (Len > std::numeric_limits<unsigned>::max())
3197         return false;
3198       auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3199       auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3200       return canConvertValue(DL, SrcTy, AllocaTy) &&
3201              DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3202     }();
3203 
3204     // If this doesn't map cleanly onto the alloca type, and that type isn't
3205     // a single value type, just emit a memset.
3206     if (!CanContinue) {
3207       Type *SizeTy = II.getLength()->getType();
3208       unsigned Sz = NewEndOffset - NewBeginOffset;
3209       Constant *Size = ConstantInt::get(SizeTy, Sz);
3210       MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3211           getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3212           MaybeAlign(getSliceAlign()), II.isVolatile()));
3213       if (AATags)
3214         New->setAAMetadata(
3215             AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3216 
3217       migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3218                        New, New->getRawDest(), nullptr, DL);
3219 
3220       LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3221       return false;
3222     }
3223 
3224     // If we can represent this as a simple value, we have to build the actual
3225     // value to store, which requires expanding the byte present in memset to
3226     // a sensible representation for the alloca type. This is essentially
3227     // splatting the byte to a sufficiently wide integer, splatting it across
3228     // any desired vector width, and bitcasting to the final type.
3229     Value *V;
3230 
3231     if (VecTy) {
3232       // If this is a memset of a vectorized alloca, insert it.
3233       assert(ElementTy == ScalarTy);
3234 
3235       unsigned BeginIndex = getIndex(NewBeginOffset);
3236       unsigned EndIndex = getIndex(NewEndOffset);
3237       assert(EndIndex > BeginIndex && "Empty vector!");
3238       unsigned NumElements = EndIndex - BeginIndex;
3239       assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3240              "Too many elements!");
3241 
3242       Value *Splat = getIntegerSplat(
3243           II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3244       Splat = convertValue(DL, IRB, Splat, ElementTy);
3245       if (NumElements > 1)
3246         Splat = getVectorSplat(Splat, NumElements);
3247 
3248       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3249                                          NewAI.getAlign(), "oldload");
3250       V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3251     } else if (IntTy) {
3252       // If this is a memset on an alloca where we can widen stores, insert the
3253       // set integer.
3254       assert(!II.isVolatile());
3255 
3256       uint64_t Size = NewEndOffset - NewBeginOffset;
3257       V = getIntegerSplat(II.getValue(), Size);
3258 
3259       if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3260                     EndOffset != NewAllocaBeginOffset)) {
3261         Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3262                                            NewAI.getAlign(), "oldload");
3263         Old = convertValue(DL, IRB, Old, IntTy);
3264         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3265         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3266       } else {
3267         assert(V->getType() == IntTy &&
3268                "Wrong type for an alloca wide integer!");
3269       }
3270       V = convertValue(DL, IRB, V, AllocaTy);
3271     } else {
3272       // Established these invariants above.
3273       assert(NewBeginOffset == NewAllocaBeginOffset);
3274       assert(NewEndOffset == NewAllocaEndOffset);
3275 
3276       V = getIntegerSplat(II.getValue(),
3277                           DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3278       if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
3279         V = getVectorSplat(
3280             V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3281 
3282       V = convertValue(DL, IRB, V, AllocaTy);
3283     }
3284 
3285     Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3286     StoreInst *New =
3287         IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3288     New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3289                            LLVMContext::MD_access_group});
3290     if (AATags)
3291       New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3292                                                 V->getType(), DL));
3293 
3294     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3295                      New, New->getPointerOperand(), V, DL);
3296 
3297     LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3298     return !II.isVolatile();
3299   }
3300 
3301   bool visitMemTransferInst(MemTransferInst &II) {
3302     // Rewriting of memory transfer instructions can be a bit tricky. We break
3303     // them into two categories: split intrinsics and unsplit intrinsics.
3304 
3305     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
3306 
3307     AAMDNodes AATags = II.getAAMetadata();
3308 
3309     bool IsDest = &II.getRawDestUse() == OldUse;
3310     assert((IsDest && II.getRawDest() == OldPtr) ||
3311            (!IsDest && II.getRawSource() == OldPtr));
3312 
3313     Align SliceAlign = getSliceAlign();
3314     // For unsplit intrinsics, we simply modify the source and destination
3315     // pointers in place. This isn't just an optimization, it is a matter of
3316     // correctness. With unsplit intrinsics we may be dealing with transfers
3317     // within a single alloca before SROA ran, or with transfers that have
3318     // a variable length. We may also be dealing with memmove instead of
3319     // memcpy, and so simply updating the pointers is the necessary for us to
3320     // update both source and dest of a single call.
3321     if (!IsSplittable) {
3322       Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3323       if (IsDest) {
3324         // Update the address component of linked dbg.assigns.
3325         auto UpdateAssignAddress = [&](auto *DbgAssign) {
3326           if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3327               DbgAssign->getAddress() == II.getDest())
3328             DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3329         };
3330         for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress);
3331         for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress);
3332         II.setDest(AdjustedPtr);
3333         II.setDestAlignment(SliceAlign);
3334       } else {
3335         II.setSource(AdjustedPtr);
3336         II.setSourceAlignment(SliceAlign);
3337       }
3338 
3339       LLVM_DEBUG(dbgs() << "          to: " << II << "\n");
3340       deleteIfTriviallyDead(OldPtr);
3341       return false;
3342     }
3343     // For split transfer intrinsics we have an incredibly useful assurance:
3344     // the source and destination do not reside within the same alloca, and at
3345     // least one of them does not escape. This means that we can replace
3346     // memmove with memcpy, and we don't need to worry about all manner of
3347     // downsides to splitting and transforming the operations.
3348 
3349     // If this doesn't map cleanly onto the alloca type, and that type isn't
3350     // a single value type, just emit a memcpy.
3351     bool EmitMemCpy =
3352         !VecTy && !IntTy &&
3353         (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3354          SliceSize !=
3355              DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
3356          !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
3357          !NewAI.getAllocatedType()->isSingleValueType());
3358 
3359     // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3360     // size hasn't been shrunk based on analysis of the viable range, this is
3361     // a no-op.
3362     if (EmitMemCpy && &OldAI == &NewAI) {
3363       // Ensure the start lines up.
3364       assert(NewBeginOffset == BeginOffset);
3365 
3366       // Rewrite the size as needed.
3367       if (NewEndOffset != EndOffset)
3368         II.setLength(ConstantInt::get(II.getLength()->getType(),
3369                                       NewEndOffset - NewBeginOffset));
3370       return false;
3371     }
3372     // Record this instruction for deletion.
3373     Pass.DeadInsts.push_back(&II);
3374 
3375     // Strip all inbounds GEPs and pointer casts to try to dig out any root
3376     // alloca that should be re-examined after rewriting this instruction.
3377     Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3378     if (AllocaInst *AI =
3379             dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3380       assert(AI != &OldAI && AI != &NewAI &&
3381              "Splittable transfers cannot reach the same alloca on both ends.");
3382       Pass.Worklist.insert(AI);
3383     }
3384 
3385     Type *OtherPtrTy = OtherPtr->getType();
3386     unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3387 
3388     // Compute the relative offset for the other pointer within the transfer.
3389     unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3390     APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3391     Align OtherAlign =
3392         (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3393     OtherAlign =
3394         commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3395 
3396     if (EmitMemCpy) {
3397       // Compute the other pointer, folding as much as possible to produce
3398       // a single, simple GEP in most cases.
3399       OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3400                                 OtherPtr->getName() + ".");
3401 
3402       Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3403       Type *SizeTy = II.getLength()->getType();
3404       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3405 
3406       Value *DestPtr, *SrcPtr;
3407       MaybeAlign DestAlign, SrcAlign;
3408       // Note: IsDest is true iff we're copying into the new alloca slice
3409       if (IsDest) {
3410         DestPtr = OurPtr;
3411         DestAlign = SliceAlign;
3412         SrcPtr = OtherPtr;
3413         SrcAlign = OtherAlign;
3414       } else {
3415         DestPtr = OtherPtr;
3416         DestAlign = OtherAlign;
3417         SrcPtr = OurPtr;
3418         SrcAlign = SliceAlign;
3419       }
3420       CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3421                                        Size, II.isVolatile());
3422       if (AATags)
3423         New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3424 
3425       APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3426       if (IsDest) {
3427         migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3428                          &II, New, DestPtr, nullptr, DL);
3429       } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3430                      DestPtr->stripAndAccumulateConstantOffsets(
3431                          DL, Offset, /*AllowNonInbounds*/ true))) {
3432         migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3433                          SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3434       }
3435       LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3436       return false;
3437     }
3438 
3439     bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3440                          NewEndOffset == NewAllocaEndOffset;
3441     uint64_t Size = NewEndOffset - NewBeginOffset;
3442     unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3443     unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3444     unsigned NumElements = EndIndex - BeginIndex;
3445     IntegerType *SubIntTy =
3446         IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3447 
3448     // Reset the other pointer type to match the register type we're going to
3449     // use, but using the address space of the original other pointer.
3450     Type *OtherTy;
3451     if (VecTy && !IsWholeAlloca) {
3452       if (NumElements == 1)
3453         OtherTy = VecTy->getElementType();
3454       else
3455         OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3456     } else if (IntTy && !IsWholeAlloca) {
3457       OtherTy = SubIntTy;
3458     } else {
3459       OtherTy = NewAllocaTy;
3460     }
3461 
3462     Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3463                                    OtherPtr->getName() + ".");
3464     MaybeAlign SrcAlign = OtherAlign;
3465     MaybeAlign DstAlign = SliceAlign;
3466     if (!IsDest)
3467       std::swap(SrcAlign, DstAlign);
3468 
3469     Value *SrcPtr;
3470     Value *DstPtr;
3471 
3472     if (IsDest) {
3473       DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3474       SrcPtr = AdjPtr;
3475     } else {
3476       DstPtr = AdjPtr;
3477       SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3478     }
3479 
3480     Value *Src;
3481     if (VecTy && !IsWholeAlloca && !IsDest) {
3482       Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3483                                   NewAI.getAlign(), "load");
3484       Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3485     } else if (IntTy && !IsWholeAlloca && !IsDest) {
3486       Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3487                                   NewAI.getAlign(), "load");
3488       Src = convertValue(DL, IRB, Src, IntTy);
3489       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3490       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3491     } else {
3492       LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3493                                              II.isVolatile(), "copyload");
3494       Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3495                               LLVMContext::MD_access_group});
3496       if (AATags)
3497         Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3498                                                    Load->getType(), DL));
3499       Src = Load;
3500     }
3501 
3502     if (VecTy && !IsWholeAlloca && IsDest) {
3503       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3504                                          NewAI.getAlign(), "oldload");
3505       Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3506     } else if (IntTy && !IsWholeAlloca && IsDest) {
3507       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3508                                          NewAI.getAlign(), "oldload");
3509       Old = convertValue(DL, IRB, Old, IntTy);
3510       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3511       Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3512       Src = convertValue(DL, IRB, Src, NewAllocaTy);
3513     }
3514 
3515     StoreInst *Store = cast<StoreInst>(
3516         IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3517     Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3518                              LLVMContext::MD_access_group});
3519     if (AATags)
3520       Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3521                                                   Src->getType(), DL));
3522 
3523     APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3524     if (IsDest) {
3525 
3526       migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3527                        Store, DstPtr, Src, DL);
3528     } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3529                    DstPtr->stripAndAccumulateConstantOffsets(
3530                        DL, Offset, /*AllowNonInbounds*/ true))) {
3531       migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3532                        &II, Store, DstPtr, Src, DL);
3533     }
3534 
3535     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
3536     return !II.isVolatile();
3537   }
3538 
3539   bool visitIntrinsicInst(IntrinsicInst &II) {
3540     assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() ||
3541             II.isDroppable()) &&
3542            "Unexpected intrinsic!");
3543     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
3544 
3545     // Record this instruction for deletion.
3546     Pass.DeadInsts.push_back(&II);
3547 
3548     if (II.isDroppable()) {
3549       assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3550       // TODO For now we forget assumed information, this can be improved.
3551       OldPtr->dropDroppableUsesIn(II);
3552       return true;
3553     }
3554 
3555     if (II.isLaunderOrStripInvariantGroup())
3556       return true;
3557 
3558     assert(II.getArgOperand(1) == OldPtr);
3559     // Lifetime intrinsics are only promotable if they cover the whole alloca.
3560     // Therefore, we drop lifetime intrinsics which don't cover the whole
3561     // alloca.
3562     // (In theory, intrinsics which partially cover an alloca could be
3563     // promoted, but PromoteMemToReg doesn't handle that case.)
3564     // FIXME: Check whether the alloca is promotable before dropping the
3565     // lifetime intrinsics?
3566     if (NewBeginOffset != NewAllocaBeginOffset ||
3567         NewEndOffset != NewAllocaEndOffset)
3568       return true;
3569 
3570     ConstantInt *Size =
3571         ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3572                          NewEndOffset - NewBeginOffset);
3573     // Lifetime intrinsics always expect an i8* so directly get such a pointer
3574     // for the new alloca slice.
3575     Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3576     Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3577     Value *New;
3578     if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3579       New = IRB.CreateLifetimeStart(Ptr, Size);
3580     else
3581       New = IRB.CreateLifetimeEnd(Ptr, Size);
3582 
3583     (void)New;
3584     LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3585 
3586     return true;
3587   }
3588 
3589   void fixLoadStoreAlign(Instruction &Root) {
3590     // This algorithm implements the same visitor loop as
3591     // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3592     // or store found.
3593     SmallPtrSet<Instruction *, 4> Visited;
3594     SmallVector<Instruction *, 4> Uses;
3595     Visited.insert(&Root);
3596     Uses.push_back(&Root);
3597     do {
3598       Instruction *I = Uses.pop_back_val();
3599 
3600       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3601         LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3602         continue;
3603       }
3604       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3605         SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3606         continue;
3607       }
3608 
3609       assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3610              isa<PHINode>(I) || isa<SelectInst>(I) ||
3611              isa<GetElementPtrInst>(I));
3612       for (User *U : I->users())
3613         if (Visited.insert(cast<Instruction>(U)).second)
3614           Uses.push_back(cast<Instruction>(U));
3615     } while (!Uses.empty());
3616   }
3617 
3618   bool visitPHINode(PHINode &PN) {
3619     LLVM_DEBUG(dbgs() << "    original: " << PN << "\n");
3620     assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3621     assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3622 
3623     // We would like to compute a new pointer in only one place, but have it be
3624     // as local as possible to the PHI. To do that, we re-use the location of
3625     // the old pointer, which necessarily must be in the right position to
3626     // dominate the PHI.
3627     IRBuilderBase::InsertPointGuard Guard(IRB);
3628     if (isa<PHINode>(OldPtr))
3629       IRB.SetInsertPoint(OldPtr->getParent(),
3630                          OldPtr->getParent()->getFirstInsertionPt());
3631     else
3632       IRB.SetInsertPoint(OldPtr);
3633     IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3634 
3635     Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3636     // Replace the operands which were using the old pointer.
3637     std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3638 
3639     LLVM_DEBUG(dbgs() << "          to: " << PN << "\n");
3640     deleteIfTriviallyDead(OldPtr);
3641 
3642     // Fix the alignment of any loads or stores using this PHI node.
3643     fixLoadStoreAlign(PN);
3644 
3645     // PHIs can't be promoted on their own, but often can be speculated. We
3646     // check the speculation outside of the rewriter so that we see the
3647     // fully-rewritten alloca.
3648     PHIUsers.insert(&PN);
3649     return true;
3650   }
3651 
3652   bool visitSelectInst(SelectInst &SI) {
3653     LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
3654     assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3655            "Pointer isn't an operand!");
3656     assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3657     assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3658 
3659     Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3660     // Replace the operands which were using the old pointer.
3661     if (SI.getOperand(1) == OldPtr)
3662       SI.setOperand(1, NewPtr);
3663     if (SI.getOperand(2) == OldPtr)
3664       SI.setOperand(2, NewPtr);
3665 
3666     LLVM_DEBUG(dbgs() << "          to: " << SI << "\n");
3667     deleteIfTriviallyDead(OldPtr);
3668 
3669     // Fix the alignment of any loads or stores using this select.
3670     fixLoadStoreAlign(SI);
3671 
3672     // Selects can't be promoted on their own, but often can be speculated. We
3673     // check the speculation outside of the rewriter so that we see the
3674     // fully-rewritten alloca.
3675     SelectUsers.insert(&SI);
3676     return true;
3677   }
3678 };
3679 
3680 /// Visitor to rewrite aggregate loads and stores as scalar.
3681 ///
3682 /// This pass aggressively rewrites all aggregate loads and stores on
3683 /// a particular pointer (or any pointer derived from it which we can identify)
3684 /// with scalar loads and stores.
3685 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3686   // Befriend the base class so it can delegate to private visit methods.
3687   friend class InstVisitor<AggLoadStoreRewriter, bool>;
3688 
3689   /// Queue of pointer uses to analyze and potentially rewrite.
3690   SmallVector<Use *, 8> Queue;
3691 
3692   /// Set to prevent us from cycling with phi nodes and loops.
3693   SmallPtrSet<User *, 8> Visited;
3694 
3695   /// The current pointer use being rewritten. This is used to dig up the used
3696   /// value (as opposed to the user).
3697   Use *U = nullptr;
3698 
3699   /// Used to calculate offsets, and hence alignment, of subobjects.
3700   const DataLayout &DL;
3701 
3702   IRBuilderTy &IRB;
3703 
3704 public:
3705   AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3706       : DL(DL), IRB(IRB) {}
3707 
3708   /// Rewrite loads and stores through a pointer and all pointers derived from
3709   /// it.
3710   bool rewrite(Instruction &I) {
3711     LLVM_DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
3712     enqueueUsers(I);
3713     bool Changed = false;
3714     while (!Queue.empty()) {
3715       U = Queue.pop_back_val();
3716       Changed |= visit(cast<Instruction>(U->getUser()));
3717     }
3718     return Changed;
3719   }
3720 
3721 private:
3722   /// Enqueue all the users of the given instruction for further processing.
3723   /// This uses a set to de-duplicate users.
3724   void enqueueUsers(Instruction &I) {
3725     for (Use &U : I.uses())
3726       if (Visited.insert(U.getUser()).second)
3727         Queue.push_back(&U);
3728   }
3729 
3730   // Conservative default is to not rewrite anything.
3731   bool visitInstruction(Instruction &I) { return false; }
3732 
3733   /// Generic recursive split emission class.
3734   template <typename Derived> class OpSplitter {
3735   protected:
3736     /// The builder used to form new instructions.
3737     IRBuilderTy &IRB;
3738 
3739     /// The indices which to be used with insert- or extractvalue to select the
3740     /// appropriate value within the aggregate.
3741     SmallVector<unsigned, 4> Indices;
3742 
3743     /// The indices to a GEP instruction which will move Ptr to the correct slot
3744     /// within the aggregate.
3745     SmallVector<Value *, 4> GEPIndices;
3746 
3747     /// The base pointer of the original op, used as a base for GEPing the
3748     /// split operations.
3749     Value *Ptr;
3750 
3751     /// The base pointee type being GEPed into.
3752     Type *BaseTy;
3753 
3754     /// Known alignment of the base pointer.
3755     Align BaseAlign;
3756 
3757     /// To calculate offset of each component so we can correctly deduce
3758     /// alignments.
3759     const DataLayout &DL;
3760 
3761     /// Initialize the splitter with an insertion point, Ptr and start with a
3762     /// single zero GEP index.
3763     OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3764                Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
3765         : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
3766           BaseAlign(BaseAlign), DL(DL) {
3767       IRB.SetInsertPoint(InsertionPoint);
3768     }
3769 
3770   public:
3771     /// Generic recursive split emission routine.
3772     ///
3773     /// This method recursively splits an aggregate op (load or store) into
3774     /// scalar or vector ops. It splits recursively until it hits a single value
3775     /// and emits that single value operation via the template argument.
3776     ///
3777     /// The logic of this routine relies on GEPs and insertvalue and
3778     /// extractvalue all operating with the same fundamental index list, merely
3779     /// formatted differently (GEPs need actual values).
3780     ///
3781     /// \param Ty  The type being split recursively into smaller ops.
3782     /// \param Agg The aggregate value being built up or stored, depending on
3783     /// whether this is splitting a load or a store respectively.
3784     void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3785       if (Ty->isSingleValueType()) {
3786         unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
3787         return static_cast<Derived *>(this)->emitFunc(
3788             Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
3789       }
3790 
3791       if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3792         unsigned OldSize = Indices.size();
3793         (void)OldSize;
3794         for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3795              ++Idx) {
3796           assert(Indices.size() == OldSize && "Did not return to the old size");
3797           Indices.push_back(Idx);
3798           GEPIndices.push_back(IRB.getInt32(Idx));
3799           emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3800           GEPIndices.pop_back();
3801           Indices.pop_back();
3802         }
3803         return;
3804       }
3805 
3806       if (StructType *STy = dyn_cast<StructType>(Ty)) {
3807         unsigned OldSize = Indices.size();
3808         (void)OldSize;
3809         for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3810              ++Idx) {
3811           assert(Indices.size() == OldSize && "Did not return to the old size");
3812           Indices.push_back(Idx);
3813           GEPIndices.push_back(IRB.getInt32(Idx));
3814           emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3815           GEPIndices.pop_back();
3816           Indices.pop_back();
3817         }
3818         return;
3819       }
3820 
3821       llvm_unreachable("Only arrays and structs are aggregate loadable types");
3822     }
3823   };
3824 
3825   struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3826     AAMDNodes AATags;
3827     // A vector to hold the split components that we want to emit
3828     // separate fake uses for.
3829     SmallVector<Value *, 4> Components;
3830     // A vector to hold all the fake uses of the struct that we are splitting.
3831     // Usually there should only be one, but we are handling the general case.
3832     SmallVector<Instruction *, 1> FakeUses;
3833 
3834     LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3835                    AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3836                    IRBuilderTy &IRB)
3837         : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
3838                                      IRB),
3839           AATags(AATags) {}
3840 
3841     /// Emit a leaf load of a single value. This is called at the leaves of the
3842     /// recursive emission to actually load values.
3843     void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3844       assert(Ty->isSingleValueType());
3845       // Load the single value and insert it using the indices.
3846       Value *GEP =
3847           IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3848       LoadInst *Load =
3849           IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
3850 
3851       APInt Offset(
3852           DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3853       if (AATags &&
3854           GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3855         Load->setAAMetadata(
3856             AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
3857       // Record the load so we can generate a fake use for this aggregate
3858       // component.
3859       Components.push_back(Load);
3860 
3861       Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3862       LLVM_DEBUG(dbgs() << "          to: " << *Load << "\n");
3863     }
3864 
3865     // Stash the fake uses that use the value generated by this instruction.
3866     void recordFakeUses(LoadInst &LI) {
3867       for (Use &U : LI.uses())
3868         if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
3869           if (II->getIntrinsicID() == Intrinsic::fake_use)
3870             FakeUses.push_back(II);
3871     }
3872 
3873     // Replace all fake uses of the aggregate with a series of fake uses, one
3874     // for each split component.
3875     void emitFakeUses() {
3876       for (Instruction *I : FakeUses) {
3877         IRB.SetInsertPoint(I);
3878         for (auto *V : Components)
3879           IRB.CreateIntrinsic(Intrinsic::fake_use, {}, {V});
3880         I->eraseFromParent();
3881       }
3882     }
3883   };
3884 
3885   bool visitLoadInst(LoadInst &LI) {
3886     assert(LI.getPointerOperand() == *U);
3887     if (!LI.isSimple() || LI.getType()->isSingleValueType())
3888       return false;
3889 
3890     // We have an aggregate being loaded, split it apart.
3891     LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
3892     LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3893                             getAdjustedAlignment(&LI, 0), DL, IRB);
3894     Splitter.recordFakeUses(LI);
3895     Value *V = PoisonValue::get(LI.getType());
3896     Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3897     Splitter.emitFakeUses();
3898     Visited.erase(&LI);
3899     LI.replaceAllUsesWith(V);
3900     LI.eraseFromParent();
3901     return true;
3902   }
3903 
3904   struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3905     StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3906                     AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
3907                     const DataLayout &DL, IRBuilderTy &IRB)
3908         : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3909                                       DL, IRB),
3910           AATags(AATags), AggStore(AggStore) {}
3911     AAMDNodes AATags;
3912     StoreInst *AggStore;
3913     /// Emit a leaf store of a single value. This is called at the leaves of the
3914     /// recursive emission to actually produce stores.
3915     void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3916       assert(Ty->isSingleValueType());
3917       // Extract the single value and store it using the indices.
3918       //
3919       // The gep and extractvalue values are factored out of the CreateStore
3920       // call to make the output independent of the argument evaluation order.
3921       Value *ExtractValue =
3922           IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
3923       Value *InBoundsGEP =
3924           IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3925       StoreInst *Store =
3926           IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
3927 
3928       APInt Offset(
3929           DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3930       GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
3931       if (AATags) {
3932         Store->setAAMetadata(AATags.adjustForAccess(
3933             Offset.getZExtValue(), ExtractValue->getType(), DL));
3934       }
3935 
3936       // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
3937       // If we cannot (because there's an intervening non-const or unbounded
3938       // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
3939       // this instruction.
3940       Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
3941       if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
3942         uint64_t SizeInBits =
3943             DL.getTypeSizeInBits(Store->getValueOperand()->getType());
3944         migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
3945                          SizeInBits, AggStore, Store,
3946                          Store->getPointerOperand(), Store->getValueOperand(),
3947                          DL);
3948       } else {
3949         assert(at::getAssignmentMarkers(Store).empty() &&
3950                at::getDVRAssignmentMarkers(Store).empty() &&
3951                "AT: unexpected debug.assign linked to store through "
3952                "unbounded GEP");
3953       }
3954       LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
3955     }
3956   };
3957 
3958   bool visitStoreInst(StoreInst &SI) {
3959     if (!SI.isSimple() || SI.getPointerOperand() != *U)
3960       return false;
3961     Value *V = SI.getValueOperand();
3962     if (V->getType()->isSingleValueType())
3963       return false;
3964 
3965     // We have an aggregate being stored, split it apart.
3966     LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
3967     StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
3968                              getAdjustedAlignment(&SI, 0), DL, IRB);
3969     Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
3970     Visited.erase(&SI);
3971     // The stores replacing SI each have markers describing fragments of the
3972     // assignment so delete the assignment markers linked to SI.
3973     at::deleteAssignmentMarkers(&SI);
3974     SI.eraseFromParent();
3975     return true;
3976   }
3977 
3978   bool visitBitCastInst(BitCastInst &BC) {
3979     enqueueUsers(BC);
3980     return false;
3981   }
3982 
3983   bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
3984     enqueueUsers(ASC);
3985     return false;
3986   }
3987 
3988   // Unfold gep (select cond, ptr1, ptr2), idx
3989   //   => select cond, gep(ptr1, idx), gep(ptr2, idx)
3990   // and  gep ptr, (select cond, idx1, idx2)
3991   //   => select cond, gep(ptr, idx1), gep(ptr, idx2)
3992   bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
3993     // Check whether the GEP has exactly one select operand and all indices
3994     // will become constant after the transform.
3995     SelectInst *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand());
3996     for (Value *Op : GEPI.indices()) {
3997       if (auto *SI = dyn_cast<SelectInst>(Op)) {
3998         if (Sel)
3999           return false;
4000 
4001         Sel = SI;
4002         if (!isa<ConstantInt>(Sel->getTrueValue()) ||
4003             !isa<ConstantInt>(Sel->getFalseValue()))
4004           return false;
4005         continue;
4006       }
4007 
4008       if (!isa<ConstantInt>(Op))
4009         return false;
4010     }
4011 
4012     if (!Sel)
4013       return false;
4014 
4015     LLVM_DEBUG(dbgs() << "  Rewriting gep(select) -> select(gep):\n";
4016                dbgs() << "    original: " << *Sel << "\n";
4017                dbgs() << "              " << GEPI << "\n";);
4018 
4019     auto GetNewOps = [&](Value *SelOp) {
4020       SmallVector<Value *> NewOps;
4021       for (Value *Op : GEPI.operands())
4022         if (Op == Sel)
4023           NewOps.push_back(SelOp);
4024         else
4025           NewOps.push_back(Op);
4026       return NewOps;
4027     };
4028 
4029     Value *True = Sel->getTrueValue();
4030     Value *False = Sel->getFalseValue();
4031     SmallVector<Value *> TrueOps = GetNewOps(True);
4032     SmallVector<Value *> FalseOps = GetNewOps(False);
4033 
4034     IRB.SetInsertPoint(&GEPI);
4035     GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4036 
4037     Type *Ty = GEPI.getSourceElementType();
4038     Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4039                                  True->getName() + ".sroa.gep", NW);
4040 
4041     Value *NFalse =
4042         IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4043                       False->getName() + ".sroa.gep", NW);
4044 
4045     Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse,
4046                                    Sel->getName() + ".sroa.sel");
4047     Visited.erase(&GEPI);
4048     GEPI.replaceAllUsesWith(NSel);
4049     GEPI.eraseFromParent();
4050     Instruction *NSelI = cast<Instruction>(NSel);
4051     Visited.insert(NSelI);
4052     enqueueUsers(*NSelI);
4053 
4054     LLVM_DEBUG(dbgs() << "          to: " << *NTrue << "\n";
4055                dbgs() << "              " << *NFalse << "\n";
4056                dbgs() << "              " << *NSel << "\n";);
4057 
4058     return true;
4059   }
4060 
4061   // Unfold gep (phi ptr1, ptr2), idx
4062   //   => phi ((gep ptr1, idx), (gep ptr2, idx))
4063   // and  gep ptr, (phi idx1, idx2)
4064   //   => phi ((gep ptr, idx1), (gep ptr, idx2))
4065   bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4066     // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4067     // operand (looking through the phi if it is the phi we want to unfold) is
4068     // an instruction besides a static alloca.
4069     PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4070     auto IsInvalidPointerOperand = [](Value *V) {
4071       if (!isa<Instruction>(V))
4072         return false;
4073       if (auto *AI = dyn_cast<AllocaInst>(V))
4074         return !AI->isStaticAlloca();
4075       return true;
4076     };
4077     if (Phi) {
4078       if (any_of(Phi->operands(), IsInvalidPointerOperand))
4079         return false;
4080     } else {
4081       if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4082         return false;
4083     }
4084     // Check whether the GEP has exactly one phi operand (including the pointer
4085     // operand) and all indices will become constant after the transform.
4086     for (Value *Op : GEPI.indices()) {
4087       if (auto *SI = dyn_cast<PHINode>(Op)) {
4088         if (Phi)
4089           return false;
4090 
4091         Phi = SI;
4092         if (!all_of(Phi->incoming_values(),
4093                     [](Value *V) { return isa<ConstantInt>(V); }))
4094           return false;
4095         continue;
4096       }
4097 
4098       if (!isa<ConstantInt>(Op))
4099         return false;
4100     }
4101 
4102     if (!Phi)
4103       return false;
4104 
4105     LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):\n";
4106                dbgs() << "    original: " << *Phi << "\n";
4107                dbgs() << "              " << GEPI << "\n";);
4108 
4109     auto GetNewOps = [&](Value *PhiOp) {
4110       SmallVector<Value *> NewOps;
4111       for (Value *Op : GEPI.operands())
4112         if (Op == Phi)
4113           NewOps.push_back(PhiOp);
4114         else
4115           NewOps.push_back(Op);
4116       return NewOps;
4117     };
4118 
4119     IRB.SetInsertPoint(Phi);
4120     PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4121                                     Phi->getName() + ".sroa.phi");
4122 
4123     Type *SourceTy = GEPI.getSourceElementType();
4124     // We only handle arguments, constants, and static allocas here, so we can
4125     // insert GEPs at the end of the entry block.
4126     IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4127     for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4128       Value *Op = Phi->getIncomingValue(I);
4129       BasicBlock *BB = Phi->getIncomingBlock(I);
4130       Value *NewGEP;
4131       if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4132         NewGEP = NewPhi->getIncomingValue(NI);
4133       } else {
4134         SmallVector<Value *> NewOps = GetNewOps(Op);
4135         NewGEP =
4136             IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4137                           Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4138       }
4139       NewPhi->addIncoming(NewGEP, BB);
4140     }
4141 
4142     Visited.erase(&GEPI);
4143     GEPI.replaceAllUsesWith(NewPhi);
4144     GEPI.eraseFromParent();
4145     Visited.insert(NewPhi);
4146     enqueueUsers(*NewPhi);
4147 
4148     LLVM_DEBUG(dbgs() << "          to: ";
4149                for (Value *In
4150                     : NewPhi->incoming_values()) dbgs()
4151                << "\n              " << *In;
4152                dbgs() << "\n              " << *NewPhi << '\n');
4153 
4154     return true;
4155   }
4156 
4157   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4158     if (unfoldGEPSelect(GEPI))
4159       return true;
4160 
4161     if (unfoldGEPPhi(GEPI))
4162       return true;
4163 
4164     enqueueUsers(GEPI);
4165     return false;
4166   }
4167 
4168   bool visitPHINode(PHINode &PN) {
4169     enqueueUsers(PN);
4170     return false;
4171   }
4172 
4173   bool visitSelectInst(SelectInst &SI) {
4174     enqueueUsers(SI);
4175     return false;
4176   }
4177 };
4178 
4179 } // end anonymous namespace
4180 
4181 /// Strip aggregate type wrapping.
4182 ///
4183 /// This removes no-op aggregate types wrapping an underlying type. It will
4184 /// strip as many layers of types as it can without changing either the type
4185 /// size or the allocated size.
4186 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4187   if (Ty->isSingleValueType())
4188     return Ty;
4189 
4190   uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4191   uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4192 
4193   Type *InnerTy;
4194   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4195     InnerTy = ArrTy->getElementType();
4196   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4197     const StructLayout *SL = DL.getStructLayout(STy);
4198     unsigned Index = SL->getElementContainingOffset(0);
4199     InnerTy = STy->getElementType(Index);
4200   } else {
4201     return Ty;
4202   }
4203 
4204   if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4205       TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4206     return Ty;
4207 
4208   return stripAggregateTypeWrapping(DL, InnerTy);
4209 }
4210 
4211 /// Try to find a partition of the aggregate type passed in for a given
4212 /// offset and size.
4213 ///
4214 /// This recurses through the aggregate type and tries to compute a subtype
4215 /// based on the offset and size. When the offset and size span a sub-section
4216 /// of an array, it will even compute a new array type for that sub-section,
4217 /// and the same for structs.
4218 ///
4219 /// Note that this routine is very strict and tries to find a partition of the
4220 /// type which produces the *exact* right offset and size. It is not forgiving
4221 /// when the size or offset cause either end of type-based partition to be off.
4222 /// Also, this is a best-effort routine. It is reasonable to give up and not
4223 /// return a type if necessary.
4224 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4225                               uint64_t Size) {
4226   if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4227     return stripAggregateTypeWrapping(DL, Ty);
4228   if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4229       (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4230     return nullptr;
4231 
4232   if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4233     Type *ElementTy;
4234     uint64_t TyNumElements;
4235     if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4236       ElementTy = AT->getElementType();
4237       TyNumElements = AT->getNumElements();
4238     } else {
4239       // FIXME: This isn't right for vectors with non-byte-sized or
4240       // non-power-of-two sized elements.
4241       auto *VT = cast<FixedVectorType>(Ty);
4242       ElementTy = VT->getElementType();
4243       TyNumElements = VT->getNumElements();
4244     }
4245     uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4246     uint64_t NumSkippedElements = Offset / ElementSize;
4247     if (NumSkippedElements >= TyNumElements)
4248       return nullptr;
4249     Offset -= NumSkippedElements * ElementSize;
4250 
4251     // First check if we need to recurse.
4252     if (Offset > 0 || Size < ElementSize) {
4253       // Bail if the partition ends in a different array element.
4254       if ((Offset + Size) > ElementSize)
4255         return nullptr;
4256       // Recurse through the element type trying to peel off offset bytes.
4257       return getTypePartition(DL, ElementTy, Offset, Size);
4258     }
4259     assert(Offset == 0);
4260 
4261     if (Size == ElementSize)
4262       return stripAggregateTypeWrapping(DL, ElementTy);
4263     assert(Size > ElementSize);
4264     uint64_t NumElements = Size / ElementSize;
4265     if (NumElements * ElementSize != Size)
4266       return nullptr;
4267     return ArrayType::get(ElementTy, NumElements);
4268   }
4269 
4270   StructType *STy = dyn_cast<StructType>(Ty);
4271   if (!STy)
4272     return nullptr;
4273 
4274   const StructLayout *SL = DL.getStructLayout(STy);
4275 
4276   if (SL->getSizeInBits().isScalable())
4277     return nullptr;
4278 
4279   if (Offset >= SL->getSizeInBytes())
4280     return nullptr;
4281   uint64_t EndOffset = Offset + Size;
4282   if (EndOffset > SL->getSizeInBytes())
4283     return nullptr;
4284 
4285   unsigned Index = SL->getElementContainingOffset(Offset);
4286   Offset -= SL->getElementOffset(Index);
4287 
4288   Type *ElementTy = STy->getElementType(Index);
4289   uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4290   if (Offset >= ElementSize)
4291     return nullptr; // The offset points into alignment padding.
4292 
4293   // See if any partition must be contained by the element.
4294   if (Offset > 0 || Size < ElementSize) {
4295     if ((Offset + Size) > ElementSize)
4296       return nullptr;
4297     return getTypePartition(DL, ElementTy, Offset, Size);
4298   }
4299   assert(Offset == 0);
4300 
4301   if (Size == ElementSize)
4302     return stripAggregateTypeWrapping(DL, ElementTy);
4303 
4304   StructType::element_iterator EI = STy->element_begin() + Index,
4305                                EE = STy->element_end();
4306   if (EndOffset < SL->getSizeInBytes()) {
4307     unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4308     if (Index == EndIndex)
4309       return nullptr; // Within a single element and its padding.
4310 
4311     // Don't try to form "natural" types if the elements don't line up with the
4312     // expected size.
4313     // FIXME: We could potentially recurse down through the last element in the
4314     // sub-struct to find a natural end point.
4315     if (SL->getElementOffset(EndIndex) != EndOffset)
4316       return nullptr;
4317 
4318     assert(Index < EndIndex);
4319     EE = STy->element_begin() + EndIndex;
4320   }
4321 
4322   // Try to build up a sub-structure.
4323   StructType *SubTy =
4324       StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4325   const StructLayout *SubSL = DL.getStructLayout(SubTy);
4326   if (Size != SubSL->getSizeInBytes())
4327     return nullptr; // The sub-struct doesn't have quite the size needed.
4328 
4329   return SubTy;
4330 }
4331 
4332 /// Pre-split loads and stores to simplify rewriting.
4333 ///
4334 /// We want to break up the splittable load+store pairs as much as
4335 /// possible. This is important to do as a preprocessing step, as once we
4336 /// start rewriting the accesses to partitions of the alloca we lose the
4337 /// necessary information to correctly split apart paired loads and stores
4338 /// which both point into this alloca. The case to consider is something like
4339 /// the following:
4340 ///
4341 ///   %a = alloca [12 x i8]
4342 ///   %gep1 = getelementptr i8, ptr %a, i32 0
4343 ///   %gep2 = getelementptr i8, ptr %a, i32 4
4344 ///   %gep3 = getelementptr i8, ptr %a, i32 8
4345 ///   store float 0.0, ptr %gep1
4346 ///   store float 1.0, ptr %gep2
4347 ///   %v = load i64, ptr %gep1
4348 ///   store i64 %v, ptr %gep2
4349 ///   %f1 = load float, ptr %gep2
4350 ///   %f2 = load float, ptr %gep3
4351 ///
4352 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4353 /// promote everything so we recover the 2 SSA values that should have been
4354 /// there all along.
4355 ///
4356 /// \returns true if any changes are made.
4357 bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4358   LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4359 
4360   // Track the loads and stores which are candidates for pre-splitting here, in
4361   // the order they first appear during the partition scan. These give stable
4362   // iteration order and a basis for tracking which loads and stores we
4363   // actually split.
4364   SmallVector<LoadInst *, 4> Loads;
4365   SmallVector<StoreInst *, 4> Stores;
4366 
4367   // We need to accumulate the splits required of each load or store where we
4368   // can find them via a direct lookup. This is important to cross-check loads
4369   // and stores against each other. We also track the slice so that we can kill
4370   // all the slices that end up split.
4371   struct SplitOffsets {
4372     Slice *S;
4373     std::vector<uint64_t> Splits;
4374   };
4375   SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4376 
4377   // Track loads out of this alloca which cannot, for any reason, be pre-split.
4378   // This is important as we also cannot pre-split stores of those loads!
4379   // FIXME: This is all pretty gross. It means that we can be more aggressive
4380   // in pre-splitting when the load feeding the store happens to come from
4381   // a separate alloca. Put another way, the effectiveness of SROA would be
4382   // decreased by a frontend which just concatenated all of its local allocas
4383   // into one big flat alloca. But defeating such patterns is exactly the job
4384   // SROA is tasked with! Sadly, to not have this discrepancy we would have
4385   // change store pre-splitting to actually force pre-splitting of the load
4386   // that feeds it *and all stores*. That makes pre-splitting much harder, but
4387   // maybe it would make it more principled?
4388   SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4389 
4390   LLVM_DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
4391   for (auto &P : AS.partitions()) {
4392     for (Slice &S : P) {
4393       Instruction *I = cast<Instruction>(S.getUse()->getUser());
4394       if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4395         // If this is a load we have to track that it can't participate in any
4396         // pre-splitting. If this is a store of a load we have to track that
4397         // that load also can't participate in any pre-splitting.
4398         if (auto *LI = dyn_cast<LoadInst>(I))
4399           UnsplittableLoads.insert(LI);
4400         else if (auto *SI = dyn_cast<StoreInst>(I))
4401           if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4402             UnsplittableLoads.insert(LI);
4403         continue;
4404       }
4405       assert(P.endOffset() > S.beginOffset() &&
4406              "Empty or backwards partition!");
4407 
4408       // Determine if this is a pre-splittable slice.
4409       if (auto *LI = dyn_cast<LoadInst>(I)) {
4410         assert(!LI->isVolatile() && "Cannot split volatile loads!");
4411 
4412         // The load must be used exclusively to store into other pointers for
4413         // us to be able to arbitrarily pre-split it. The stores must also be
4414         // simple to avoid changing semantics.
4415         auto IsLoadSimplyStored = [](LoadInst *LI) {
4416           for (User *LU : LI->users()) {
4417             auto *SI = dyn_cast<StoreInst>(LU);
4418             if (!SI || !SI->isSimple())
4419               return false;
4420           }
4421           return true;
4422         };
4423         if (!IsLoadSimplyStored(LI)) {
4424           UnsplittableLoads.insert(LI);
4425           continue;
4426         }
4427 
4428         Loads.push_back(LI);
4429       } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4430         if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4431           // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4432           continue;
4433         auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4434         if (!StoredLoad || !StoredLoad->isSimple())
4435           continue;
4436         assert(!SI->isVolatile() && "Cannot split volatile stores!");
4437 
4438         Stores.push_back(SI);
4439       } else {
4440         // Other uses cannot be pre-split.
4441         continue;
4442       }
4443 
4444       // Record the initial split.
4445       LLVM_DEBUG(dbgs() << "    Candidate: " << *I << "\n");
4446       auto &Offsets = SplitOffsetsMap[I];
4447       assert(Offsets.Splits.empty() &&
4448              "Should not have splits the first time we see an instruction!");
4449       Offsets.S = &S;
4450       Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4451     }
4452 
4453     // Now scan the already split slices, and add a split for any of them which
4454     // we're going to pre-split.
4455     for (Slice *S : P.splitSliceTails()) {
4456       auto SplitOffsetsMapI =
4457           SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4458       if (SplitOffsetsMapI == SplitOffsetsMap.end())
4459         continue;
4460       auto &Offsets = SplitOffsetsMapI->second;
4461 
4462       assert(Offsets.S == S && "Found a mismatched slice!");
4463       assert(!Offsets.Splits.empty() &&
4464              "Cannot have an empty set of splits on the second partition!");
4465       assert(Offsets.Splits.back() ==
4466                  P.beginOffset() - Offsets.S->beginOffset() &&
4467              "Previous split does not end where this one begins!");
4468 
4469       // Record each split. The last partition's end isn't needed as the size
4470       // of the slice dictates that.
4471       if (S->endOffset() > P.endOffset())
4472         Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4473     }
4474   }
4475 
4476   // We may have split loads where some of their stores are split stores. For
4477   // such loads and stores, we can only pre-split them if their splits exactly
4478   // match relative to their starting offset. We have to verify this prior to
4479   // any rewriting.
4480   llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4481     // Lookup the load we are storing in our map of split
4482     // offsets.
4483     auto *LI = cast<LoadInst>(SI->getValueOperand());
4484     // If it was completely unsplittable, then we're done,
4485     // and this store can't be pre-split.
4486     if (UnsplittableLoads.count(LI))
4487       return true;
4488 
4489     auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4490     if (LoadOffsetsI == SplitOffsetsMap.end())
4491       return false; // Unrelated loads are definitely safe.
4492     auto &LoadOffsets = LoadOffsetsI->second;
4493 
4494     // Now lookup the store's offsets.
4495     auto &StoreOffsets = SplitOffsetsMap[SI];
4496 
4497     // If the relative offsets of each split in the load and
4498     // store match exactly, then we can split them and we
4499     // don't need to remove them here.
4500     if (LoadOffsets.Splits == StoreOffsets.Splits)
4501       return false;
4502 
4503     LLVM_DEBUG(dbgs() << "    Mismatched splits for load and store:\n"
4504                       << "      " << *LI << "\n"
4505                       << "      " << *SI << "\n");
4506 
4507     // We've found a store and load that we need to split
4508     // with mismatched relative splits. Just give up on them
4509     // and remove both instructions from our list of
4510     // candidates.
4511     UnsplittableLoads.insert(LI);
4512     return true;
4513   });
4514   // Now we have to go *back* through all the stores, because a later store may
4515   // have caused an earlier store's load to become unsplittable and if it is
4516   // unsplittable for the later store, then we can't rely on it being split in
4517   // the earlier store either.
4518   llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4519     auto *LI = cast<LoadInst>(SI->getValueOperand());
4520     return UnsplittableLoads.count(LI);
4521   });
4522   // Once we've established all the loads that can't be split for some reason,
4523   // filter any that made it into our list out.
4524   llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4525     return UnsplittableLoads.count(LI);
4526   });
4527 
4528   // If no loads or stores are left, there is no pre-splitting to be done for
4529   // this alloca.
4530   if (Loads.empty() && Stores.empty())
4531     return false;
4532 
4533   // From here on, we can't fail and will be building new accesses, so rig up
4534   // an IR builder.
4535   IRBuilderTy IRB(&AI);
4536 
4537   // Collect the new slices which we will merge into the alloca slices.
4538   SmallVector<Slice, 4> NewSlices;
4539 
4540   // Track any allocas we end up splitting loads and stores for so we iterate
4541   // on them.
4542   SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4543 
4544   // At this point, we have collected all of the loads and stores we can
4545   // pre-split, and the specific splits needed for them. We actually do the
4546   // splitting in a specific order in order to handle when one of the loads in
4547   // the value operand to one of the stores.
4548   //
4549   // First, we rewrite all of the split loads, and just accumulate each split
4550   // load in a parallel structure. We also build the slices for them and append
4551   // them to the alloca slices.
4552   SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4553   std::vector<LoadInst *> SplitLoads;
4554   const DataLayout &DL = AI.getDataLayout();
4555   for (LoadInst *LI : Loads) {
4556     SplitLoads.clear();
4557 
4558     auto &Offsets = SplitOffsetsMap[LI];
4559     unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4560     assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4561            "Load must have type size equal to store size");
4562     assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4563            "Load must be >= slice size");
4564 
4565     uint64_t BaseOffset = Offsets.S->beginOffset();
4566     assert(BaseOffset + SliceSize > BaseOffset &&
4567            "Cannot represent alloca access size using 64-bit integers!");
4568 
4569     Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4570     IRB.SetInsertPoint(LI);
4571 
4572     LLVM_DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");
4573 
4574     uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4575     int Idx = 0, Size = Offsets.Splits.size();
4576     for (;;) {
4577       auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4578       auto AS = LI->getPointerAddressSpace();
4579       auto *PartPtrTy = LI->getPointerOperandType();
4580       LoadInst *PLoad = IRB.CreateAlignedLoad(
4581           PartTy,
4582           getAdjustedPtr(IRB, DL, BasePtr,
4583                          APInt(DL.getIndexSizeInBits(AS), PartOffset),
4584                          PartPtrTy, BasePtr->getName() + "."),
4585           getAdjustedAlignment(LI, PartOffset),
4586           /*IsVolatile*/ false, LI->getName());
4587       PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4588                                 LLVMContext::MD_access_group});
4589 
4590       // Append this load onto the list of split loads so we can find it later
4591       // to rewrite the stores.
4592       SplitLoads.push_back(PLoad);
4593 
4594       // Now build a new slice for the alloca.
4595       NewSlices.push_back(
4596           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4597                 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4598                 /*IsSplittable*/ false));
4599       LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
4600                         << ", " << NewSlices.back().endOffset()
4601                         << "): " << *PLoad << "\n");
4602 
4603       // See if we've handled all the splits.
4604       if (Idx >= Size)
4605         break;
4606 
4607       // Setup the next partition.
4608       PartOffset = Offsets.Splits[Idx];
4609       ++Idx;
4610       PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4611     }
4612 
4613     // Now that we have the split loads, do the slow walk over all uses of the
4614     // load and rewrite them as split stores, or save the split loads to use
4615     // below if the store is going to be split there anyways.
4616     bool DeferredStores = false;
4617     for (User *LU : LI->users()) {
4618       StoreInst *SI = cast<StoreInst>(LU);
4619       if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4620         DeferredStores = true;
4621         LLVM_DEBUG(dbgs() << "    Deferred splitting of store: " << *SI
4622                           << "\n");
4623         continue;
4624       }
4625 
4626       Value *StoreBasePtr = SI->getPointerOperand();
4627       IRB.SetInsertPoint(SI);
4628       AAMDNodes AATags = SI->getAAMetadata();
4629 
4630       LLVM_DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");
4631 
4632       for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4633         LoadInst *PLoad = SplitLoads[Idx];
4634         uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4635         auto *PartPtrTy = SI->getPointerOperandType();
4636 
4637         auto AS = SI->getPointerAddressSpace();
4638         StoreInst *PStore = IRB.CreateAlignedStore(
4639             PLoad,
4640             getAdjustedPtr(IRB, DL, StoreBasePtr,
4641                            APInt(DL.getIndexSizeInBits(AS), PartOffset),
4642                            PartPtrTy, StoreBasePtr->getName() + "."),
4643             getAdjustedAlignment(SI, PartOffset),
4644             /*IsVolatile*/ false);
4645         PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4646                                    LLVMContext::MD_access_group,
4647                                    LLVMContext::MD_DIAssignID});
4648 
4649         if (AATags)
4650           PStore->setAAMetadata(
4651               AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4652         LLVM_DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
4653       }
4654 
4655       // We want to immediately iterate on any allocas impacted by splitting
4656       // this store, and we have to track any promotable alloca (indicated by
4657       // a direct store) as needing to be resplit because it is no longer
4658       // promotable.
4659       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4660         ResplitPromotableAllocas.insert(OtherAI);
4661         Worklist.insert(OtherAI);
4662       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4663                      StoreBasePtr->stripInBoundsOffsets())) {
4664         Worklist.insert(OtherAI);
4665       }
4666 
4667       // Mark the original store as dead.
4668       DeadInsts.push_back(SI);
4669     }
4670 
4671     // Save the split loads if there are deferred stores among the users.
4672     if (DeferredStores)
4673       SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4674 
4675     // Mark the original load as dead and kill the original slice.
4676     DeadInsts.push_back(LI);
4677     Offsets.S->kill();
4678   }
4679 
4680   // Second, we rewrite all of the split stores. At this point, we know that
4681   // all loads from this alloca have been split already. For stores of such
4682   // loads, we can simply look up the pre-existing split loads. For stores of
4683   // other loads, we split those loads first and then write split stores of
4684   // them.
4685   for (StoreInst *SI : Stores) {
4686     auto *LI = cast<LoadInst>(SI->getValueOperand());
4687     IntegerType *Ty = cast<IntegerType>(LI->getType());
4688     assert(Ty->getBitWidth() % 8 == 0);
4689     uint64_t StoreSize = Ty->getBitWidth() / 8;
4690     assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4691 
4692     auto &Offsets = SplitOffsetsMap[SI];
4693     assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4694            "Slice size should always match load size exactly!");
4695     uint64_t BaseOffset = Offsets.S->beginOffset();
4696     assert(BaseOffset + StoreSize > BaseOffset &&
4697            "Cannot represent alloca access size using 64-bit integers!");
4698 
4699     Value *LoadBasePtr = LI->getPointerOperand();
4700     Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4701 
4702     LLVM_DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");
4703 
4704     // Check whether we have an already split load.
4705     auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4706     std::vector<LoadInst *> *SplitLoads = nullptr;
4707     if (SplitLoadsMapI != SplitLoadsMap.end()) {
4708       SplitLoads = &SplitLoadsMapI->second;
4709       assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4710              "Too few split loads for the number of splits in the store!");
4711     } else {
4712       LLVM_DEBUG(dbgs() << "          of load: " << *LI << "\n");
4713     }
4714 
4715     uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4716     int Idx = 0, Size = Offsets.Splits.size();
4717     for (;;) {
4718       auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4719       auto *LoadPartPtrTy = LI->getPointerOperandType();
4720       auto *StorePartPtrTy = SI->getPointerOperandType();
4721 
4722       // Either lookup a split load or create one.
4723       LoadInst *PLoad;
4724       if (SplitLoads) {
4725         PLoad = (*SplitLoads)[Idx];
4726       } else {
4727         IRB.SetInsertPoint(LI);
4728         auto AS = LI->getPointerAddressSpace();
4729         PLoad = IRB.CreateAlignedLoad(
4730             PartTy,
4731             getAdjustedPtr(IRB, DL, LoadBasePtr,
4732                            APInt(DL.getIndexSizeInBits(AS), PartOffset),
4733                            LoadPartPtrTy, LoadBasePtr->getName() + "."),
4734             getAdjustedAlignment(LI, PartOffset),
4735             /*IsVolatile*/ false, LI->getName());
4736         PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4737                                   LLVMContext::MD_access_group});
4738       }
4739 
4740       // And store this partition.
4741       IRB.SetInsertPoint(SI);
4742       auto AS = SI->getPointerAddressSpace();
4743       StoreInst *PStore = IRB.CreateAlignedStore(
4744           PLoad,
4745           getAdjustedPtr(IRB, DL, StoreBasePtr,
4746                          APInt(DL.getIndexSizeInBits(AS), PartOffset),
4747                          StorePartPtrTy, StoreBasePtr->getName() + "."),
4748           getAdjustedAlignment(SI, PartOffset),
4749           /*IsVolatile*/ false);
4750       PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4751                                  LLVMContext::MD_access_group});
4752 
4753       // Now build a new slice for the alloca.
4754       NewSlices.push_back(
4755           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4756                 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
4757                 /*IsSplittable*/ false));
4758       LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
4759                         << ", " << NewSlices.back().endOffset()
4760                         << "): " << *PStore << "\n");
4761       if (!SplitLoads) {
4762         LLVM_DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");
4763       }
4764 
4765       // See if we've finished all the splits.
4766       if (Idx >= Size)
4767         break;
4768 
4769       // Setup the next partition.
4770       PartOffset = Offsets.Splits[Idx];
4771       ++Idx;
4772       PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4773     }
4774 
4775     // We want to immediately iterate on any allocas impacted by splitting
4776     // this load, which is only relevant if it isn't a load of this alloca and
4777     // thus we didn't already split the loads above. We also have to keep track
4778     // of any promotable allocas we split loads on as they can no longer be
4779     // promoted.
4780     if (!SplitLoads) {
4781       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
4782         assert(OtherAI != &AI && "We can't re-split our own alloca!");
4783         ResplitPromotableAllocas.insert(OtherAI);
4784         Worklist.insert(OtherAI);
4785       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4786                      LoadBasePtr->stripInBoundsOffsets())) {
4787         assert(OtherAI != &AI && "We can't re-split our own alloca!");
4788         Worklist.insert(OtherAI);
4789       }
4790     }
4791 
4792     // Mark the original store as dead now that we've split it up and kill its
4793     // slice. Note that we leave the original load in place unless this store
4794     // was its only use. It may in turn be split up if it is an alloca load
4795     // for some other alloca, but it may be a normal load. This may introduce
4796     // redundant loads, but where those can be merged the rest of the optimizer
4797     // should handle the merging, and this uncovers SSA splits which is more
4798     // important. In practice, the original loads will almost always be fully
4799     // split and removed eventually, and the splits will be merged by any
4800     // trivial CSE, including instcombine.
4801     if (LI->hasOneUse()) {
4802       assert(*LI->user_begin() == SI && "Single use isn't this store!");
4803       DeadInsts.push_back(LI);
4804     }
4805     DeadInsts.push_back(SI);
4806     Offsets.S->kill();
4807   }
4808 
4809   // Remove the killed slices that have ben pre-split.
4810   llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4811 
4812   // Insert our new slices. This will sort and merge them into the sorted
4813   // sequence.
4814   AS.insert(NewSlices);
4815 
4816   LLVM_DEBUG(dbgs() << "  Pre-split slices:\n");
4817 #ifndef NDEBUG
4818   for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4819     LLVM_DEBUG(AS.print(dbgs(), I, "    "));
4820 #endif
4821 
4822   // Finally, don't try to promote any allocas that new require re-splitting.
4823   // They have already been added to the worklist above.
4824   PromotableAllocas.set_subtract(ResplitPromotableAllocas);
4825 
4826   return true;
4827 }
4828 
4829 /// Rewrite an alloca partition's users.
4830 ///
4831 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4832 /// to rewrite uses of an alloca partition to be conducive for SSA value
4833 /// promotion. If the partition needs a new, more refined alloca, this will
4834 /// build that new alloca, preserving as much type information as possible, and
4835 /// rewrite the uses of the old alloca to point at the new one and have the
4836 /// appropriate new offsets. It also evaluates how successful the rewrite was
4837 /// at enabling promotion and if it was successful queues the alloca to be
4838 /// promoted.
4839 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4840                                    Partition &P) {
4841   // Try to compute a friendly type for this partition of the alloca. This
4842   // won't always succeed, in which case we fall back to a legal integer type
4843   // or an i8 array of an appropriate size.
4844   Type *SliceTy = nullptr;
4845   VectorType *SliceVecTy = nullptr;
4846   const DataLayout &DL = AI.getDataLayout();
4847   std::pair<Type *, IntegerType *> CommonUseTy =
4848       findCommonType(P.begin(), P.end(), P.endOffset());
4849   // Do all uses operate on the same type?
4850   if (CommonUseTy.first)
4851     if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) {
4852       SliceTy = CommonUseTy.first;
4853       SliceVecTy = dyn_cast<VectorType>(SliceTy);
4854     }
4855   // If not, can we find an appropriate subtype in the original allocated type?
4856   if (!SliceTy)
4857     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4858                                                  P.beginOffset(), P.size()))
4859       SliceTy = TypePartitionTy;
4860 
4861   // If still not, can we use the largest bitwidth integer type used?
4862   if (!SliceTy && CommonUseTy.second)
4863     if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
4864       SliceTy = CommonUseTy.second;
4865       SliceVecTy = dyn_cast<VectorType>(SliceTy);
4866     }
4867   if ((!SliceTy || (SliceTy->isArrayTy() &&
4868                     SliceTy->getArrayElementType()->isIntegerTy())) &&
4869       DL.isLegalInteger(P.size() * 8)) {
4870     SliceTy = Type::getIntNTy(*C, P.size() * 8);
4871   }
4872 
4873   // If the common use types are not viable for promotion then attempt to find
4874   // another type that is viable.
4875   if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
4876     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4877                                                  P.beginOffset(), P.size())) {
4878       VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
4879       if (TypePartitionVecTy &&
4880           checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
4881         SliceTy = TypePartitionTy;
4882     }
4883 
4884   if (!SliceTy)
4885     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
4886   assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
4887 
4888   bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
4889 
4890   VectorType *VecTy =
4891       IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
4892   if (VecTy)
4893     SliceTy = VecTy;
4894 
4895   // Check for the case where we're going to rewrite to a new alloca of the
4896   // exact same type as the original, and with the same access offsets. In that
4897   // case, re-use the existing alloca, but still run through the rewriter to
4898   // perform phi and select speculation.
4899   // P.beginOffset() can be non-zero even with the same type in a case with
4900   // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4901   AllocaInst *NewAI;
4902   if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4903     NewAI = &AI;
4904     // FIXME: We should be able to bail at this point with "nothing changed".
4905     // FIXME: We might want to defer PHI speculation until after here.
4906     // FIXME: return nullptr;
4907   } else {
4908     // Make sure the alignment is compatible with P.beginOffset().
4909     const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
4910     // If we will get at least this much alignment from the type alone, leave
4911     // the alloca's alignment unconstrained.
4912     const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
4913     NewAI = new AllocaInst(
4914         SliceTy, AI.getAddressSpace(), nullptr,
4915         IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
4916         AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
4917         AI.getIterator());
4918     // Copy the old AI debug location over to the new one.
4919     NewAI->setDebugLoc(AI.getDebugLoc());
4920     ++NumNewAllocas;
4921   }
4922 
4923   LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
4924                     << "," << P.endOffset() << ") to: " << *NewAI << "\n");
4925 
4926   // Track the high watermark on the worklist as it is only relevant for
4927   // promoted allocas. We will reset it to this point if the alloca is not in
4928   // fact scheduled for promotion.
4929   unsigned PPWOldSize = PostPromotionWorklist.size();
4930   unsigned NumUses = 0;
4931   SmallSetVector<PHINode *, 8> PHIUsers;
4932   SmallSetVector<SelectInst *, 8> SelectUsers;
4933 
4934   AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4935                                P.endOffset(), IsIntegerPromotable, VecTy,
4936                                PHIUsers, SelectUsers);
4937   bool Promotable = true;
4938   for (Slice *S : P.splitSliceTails()) {
4939     Promotable &= Rewriter.visit(S);
4940     ++NumUses;
4941   }
4942   for (Slice &S : P) {
4943     Promotable &= Rewriter.visit(&S);
4944     ++NumUses;
4945   }
4946 
4947   NumAllocaPartitionUses += NumUses;
4948   MaxUsesPerAllocaPartition.updateMax(NumUses);
4949 
4950   // Now that we've processed all the slices in the new partition, check if any
4951   // PHIs or Selects would block promotion.
4952   for (PHINode *PHI : PHIUsers)
4953     if (!isSafePHIToSpeculate(*PHI)) {
4954       Promotable = false;
4955       PHIUsers.clear();
4956       SelectUsers.clear();
4957       break;
4958     }
4959 
4960   SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
4961       NewSelectsToRewrite;
4962   NewSelectsToRewrite.reserve(SelectUsers.size());
4963   for (SelectInst *Sel : SelectUsers) {
4964     std::optional<RewriteableMemOps> Ops =
4965         isSafeSelectToSpeculate(*Sel, PreserveCFG);
4966     if (!Ops) {
4967       Promotable = false;
4968       PHIUsers.clear();
4969       SelectUsers.clear();
4970       NewSelectsToRewrite.clear();
4971       break;
4972     }
4973     NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
4974   }
4975 
4976   if (Promotable) {
4977     for (Use *U : AS.getDeadUsesIfPromotable()) {
4978       auto *OldInst = dyn_cast<Instruction>(U->get());
4979       Value::dropDroppableUse(*U);
4980       if (OldInst)
4981         if (isInstructionTriviallyDead(OldInst))
4982           DeadInsts.push_back(OldInst);
4983     }
4984     if (PHIUsers.empty() && SelectUsers.empty()) {
4985       // Promote the alloca.
4986       PromotableAllocas.insert(NewAI);
4987     } else {
4988       // If we have either PHIs or Selects to speculate, add them to those
4989       // worklists and re-queue the new alloca so that we promote in on the
4990       // next iteration.
4991       for (PHINode *PHIUser : PHIUsers)
4992         SpeculatablePHIs.insert(PHIUser);
4993       SelectsToRewrite.reserve(SelectsToRewrite.size() +
4994                                NewSelectsToRewrite.size());
4995       for (auto &&KV : llvm::make_range(
4996                std::make_move_iterator(NewSelectsToRewrite.begin()),
4997                std::make_move_iterator(NewSelectsToRewrite.end())))
4998         SelectsToRewrite.insert(std::move(KV));
4999       Worklist.insert(NewAI);
5000     }
5001   } else {
5002     // Drop any post-promotion work items if promotion didn't happen.
5003     while (PostPromotionWorklist.size() > PPWOldSize)
5004       PostPromotionWorklist.pop_back();
5005 
5006     // We couldn't promote and we didn't create a new partition, nothing
5007     // happened.
5008     if (NewAI == &AI)
5009       return nullptr;
5010 
5011     // If we can't promote the alloca, iterate on it to check for new
5012     // refinements exposed by splitting the current alloca. Don't iterate on an
5013     // alloca which didn't actually change and didn't get promoted.
5014     Worklist.insert(NewAI);
5015   }
5016 
5017   return NewAI;
5018 }
5019 
5020 // There isn't a shared interface to get the "address" parts out of a
5021 // dbg.declare and dbg.assign, so provide some wrappers now for
5022 // both debug intrinsics and records.
5023 const Value *getAddress(const DbgVariableIntrinsic *DVI) {
5024   if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
5025     return DAI->getAddress();
5026   return cast<DbgDeclareInst>(DVI)->getAddress();
5027 }
5028 
5029 const Value *getAddress(const DbgVariableRecord *DVR) {
5030   return DVR->getAddress();
5031 }
5032 
5033 bool isKillAddress(const DbgVariableIntrinsic *DVI) {
5034   if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
5035     return DAI->isKillAddress();
5036   return cast<DbgDeclareInst>(DVI)->isKillLocation();
5037 }
5038 
5039 bool isKillAddress(const DbgVariableRecord *DVR) {
5040   if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5041     return DVR->isKillAddress();
5042   return DVR->isKillLocation();
5043 }
5044 
5045 const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) {
5046   if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
5047     return DAI->getAddressExpression();
5048   return cast<DbgDeclareInst>(DVI)->getExpression();
5049 }
5050 
5051 const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5052   if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5053     return DVR->getAddressExpression();
5054   return DVR->getExpression();
5055 }
5056 
5057 /// Create or replace an existing fragment in a DIExpression with \p Frag.
5058 /// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5059 /// operation, add \p BitExtractOffset to the offset part.
5060 ///
5061 /// Returns the new expression, or nullptr if this fails (see details below).
5062 ///
5063 /// This function is similar to DIExpression::createFragmentExpression except
5064 /// for 3 important distinctions:
5065 ///   1. The new fragment isn't relative to an existing fragment.
5066 ///   2. It assumes the computed location is a memory location. This means we
5067 ///      don't need to perform checks that creating the fragment preserves the
5068 ///      expression semantics.
5069 ///   3. Existing extract_bits are modified independently of fragment changes
5070 ///      using \p BitExtractOffset. A change to the fragment offset or size
5071 ///      may affect a bit extract. But a bit extract offset can change
5072 ///      independently of the fragment dimensions.
5073 ///
5074 /// Returns the new expression, or nullptr if one couldn't be created.
5075 /// Ideally this is only used to signal that a bit-extract has become
5076 /// zero-sized (and thus the new debug record has no size and can be
5077 /// dropped), however, it fails for other reasons too - see the FIXME below.
5078 ///
5079 /// FIXME: To keep the change that introduces this function NFC it bails
5080 /// in some situations unecessarily, e.g. when fragment and bit extract
5081 /// sizes differ.
5082 static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5083                                              DIExpression::FragmentInfo Frag,
5084                                              int64_t BitExtractOffset) {
5085   SmallVector<uint64_t, 8> Ops;
5086   bool HasFragment = false;
5087   bool HasBitExtract = false;
5088 
5089   for (auto &Op : Expr->expr_ops()) {
5090     if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5091       HasFragment = true;
5092       continue;
5093     }
5094     if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5095         Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5096       HasBitExtract = true;
5097       int64_t ExtractOffsetInBits = Op.getArg(0);
5098       int64_t ExtractSizeInBits = Op.getArg(1);
5099 
5100       // DIExpression::createFragmentExpression doesn't know how to handle
5101       // a fragment that is smaller than the extract. Copy the behaviour
5102       // (bail) to avoid non-NFC changes.
5103       // FIXME: Don't do this.
5104       if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5105         return nullptr;
5106 
5107       assert(BitExtractOffset <= 0);
5108       int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5109 
5110       // DIExpression::createFragmentExpression doesn't know what to do
5111       // if the new extract starts "outside" the existing one. Copy the
5112       // behaviour (bail) to avoid non-NFC changes.
5113       // FIXME: Don't do this.
5114       if (AdjustedOffset < 0)
5115         return nullptr;
5116 
5117       Ops.push_back(Op.getOp());
5118       Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5119       Ops.push_back(ExtractSizeInBits);
5120       continue;
5121     }
5122     Op.appendToVector(Ops);
5123   }
5124 
5125   // Unsupported by createFragmentExpression, so don't support it here yet to
5126   // preserve NFC-ness.
5127   if (HasFragment && HasBitExtract)
5128     return nullptr;
5129 
5130   if (!HasBitExtract) {
5131     Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5132     Ops.push_back(Frag.OffsetInBits);
5133     Ops.push_back(Frag.SizeInBits);
5134   }
5135   return DIExpression::get(Expr->getContext(), Ops);
5136 }
5137 
5138 /// Insert a new dbg.declare.
5139 /// \p Orig Original to copy debug loc and variable from.
5140 /// \p NewAddr Location's new base address.
5141 /// \p NewAddrExpr New expression to apply to address.
5142 /// \p BeforeInst Insert position.
5143 /// \p NewFragment New fragment (absolute, non-relative).
5144 /// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5145 static void
5146 insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr,
5147                  DIExpression *NewAddrExpr, Instruction *BeforeInst,
5148                  std::optional<DIExpression::FragmentInfo> NewFragment,
5149                  int64_t BitExtractAdjustment) {
5150   if (NewFragment)
5151     NewAddrExpr = createOrReplaceFragment(NewAddrExpr, *NewFragment,
5152                                           BitExtractAdjustment);
5153   if (!NewAddrExpr)
5154     return;
5155 
5156   DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr,
5157                     Orig->getDebugLoc(), BeforeInst);
5158 }
5159 
5160 /// Insert a new dbg.assign.
5161 /// \p Orig Original to copy debug loc, variable, value and value expression
5162 ///    from.
5163 /// \p NewAddr Location's new base address.
5164 /// \p NewAddrExpr New expression to apply to address.
5165 /// \p BeforeInst Insert position.
5166 /// \p NewFragment New fragment (absolute, non-relative).
5167 /// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5168 static void
5169 insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr,
5170                  DIExpression *NewAddrExpr, Instruction *BeforeInst,
5171                  std::optional<DIExpression::FragmentInfo> NewFragment,
5172                  int64_t BitExtractAdjustment) {
5173   // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr.
5174   (void)BeforeInst;
5175 
5176   // A dbg.assign puts fragment info in the value expression only. The address
5177   // expression has already been built: NewAddrExpr.
5178   DIExpression *NewFragmentExpr = Orig->getExpression();
5179   if (NewFragment)
5180     NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5181                                               BitExtractAdjustment);
5182   if (!NewFragmentExpr)
5183     return;
5184 
5185   // Apply a DIAssignID to the store if it doesn't already have it.
5186   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5187     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5188                          DIAssignID::getDistinct(NewAddr->getContext()));
5189   }
5190 
5191   Instruction *NewAssign = cast<Instruction *>(DIB.insertDbgAssign(
5192       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5193       NewAddrExpr, Orig->getDebugLoc()));
5194   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
5195   (void)NewAssign;
5196 }
5197 
5198 /// Insert a new DbgRecord.
5199 /// \p Orig Original to copy record type, debug loc and variable from, and
5200 ///    additionally value and value expression for dbg_assign records.
5201 /// \p NewAddr Location's new base address.
5202 /// \p NewAddrExpr New expression to apply to address.
5203 /// \p BeforeInst Insert position.
5204 /// \p NewFragment New fragment (absolute, non-relative).
5205 /// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5206 static void
5207 insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5208                  DIExpression *NewAddrExpr, Instruction *BeforeInst,
5209                  std::optional<DIExpression::FragmentInfo> NewFragment,
5210                  int64_t BitExtractAdjustment) {
5211   (void)DIB;
5212 
5213   // A dbg_assign puts fragment info in the value expression only. The address
5214   // expression has already been built: NewAddrExpr. A dbg_declare puts the
5215   // new fragment info into NewAddrExpr (as it only has one expression).
5216   DIExpression *NewFragmentExpr =
5217       Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5218   if (NewFragment)
5219     NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5220                                               BitExtractAdjustment);
5221   if (!NewFragmentExpr)
5222     return;
5223 
5224   if (Orig->isDbgDeclare()) {
5225     DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5226         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5227     BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5228                                                    BeforeInst->getIterator());
5229     return;
5230   }
5231 
5232   if (Orig->isDbgValue()) {
5233     DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5234         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5235     // Drop debug information if the expression doesn't start with a
5236     // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5237     // describes the address of alloca rather than the value inside the alloca.
5238     if (!NewFragmentExpr->startsWithDeref())
5239       DVR->setKillAddress();
5240     BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5241                                                    BeforeInst->getIterator());
5242     return;
5243   }
5244 
5245   // Apply a DIAssignID to the store if it doesn't already have it.
5246   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5247     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5248                          DIAssignID::getDistinct(NewAddr->getContext()));
5249   }
5250 
5251   DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5252       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5253       NewAddrExpr, Orig->getDebugLoc());
5254   LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5255   (void)NewAssign;
5256 }
5257 
5258 /// Walks the slices of an alloca and form partitions based on them,
5259 /// rewriting each of their uses.
5260 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5261   if (AS.begin() == AS.end())
5262     return false;
5263 
5264   unsigned NumPartitions = 0;
5265   bool Changed = false;
5266   const DataLayout &DL = AI.getModule()->getDataLayout();
5267 
5268   // First try to pre-split loads and stores.
5269   Changed |= presplitLoadsAndStores(AI, AS);
5270 
5271   // Now that we have identified any pre-splitting opportunities,
5272   // mark loads and stores unsplittable except for the following case.
5273   // We leave a slice splittable if all other slices are disjoint or fully
5274   // included in the slice, such as whole-alloca loads and stores.
5275   // If we fail to split these during pre-splitting, we want to force them
5276   // to be rewritten into a partition.
5277   bool IsSorted = true;
5278 
5279   uint64_t AllocaSize =
5280       DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
5281   const uint64_t MaxBitVectorSize = 1024;
5282   if (AllocaSize <= MaxBitVectorSize) {
5283     // If a byte boundary is included in any load or store, a slice starting or
5284     // ending at the boundary is not splittable.
5285     SmallBitVector SplittableOffset(AllocaSize + 1, true);
5286     for (Slice &S : AS)
5287       for (unsigned O = S.beginOffset() + 1;
5288            O < S.endOffset() && O < AllocaSize; O++)
5289         SplittableOffset.reset(O);
5290 
5291     for (Slice &S : AS) {
5292       if (!S.isSplittable())
5293         continue;
5294 
5295       if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5296           (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5297         continue;
5298 
5299       if (isa<LoadInst>(S.getUse()->getUser()) ||
5300           isa<StoreInst>(S.getUse()->getUser())) {
5301         S.makeUnsplittable();
5302         IsSorted = false;
5303       }
5304     }
5305   } else {
5306     // We only allow whole-alloca splittable loads and stores
5307     // for a large alloca to avoid creating too large BitVector.
5308     for (Slice &S : AS) {
5309       if (!S.isSplittable())
5310         continue;
5311 
5312       if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5313         continue;
5314 
5315       if (isa<LoadInst>(S.getUse()->getUser()) ||
5316           isa<StoreInst>(S.getUse()->getUser())) {
5317         S.makeUnsplittable();
5318         IsSorted = false;
5319       }
5320     }
5321   }
5322 
5323   if (!IsSorted)
5324     llvm::stable_sort(AS);
5325 
5326   /// Describes the allocas introduced by rewritePartition in order to migrate
5327   /// the debug info.
5328   struct Fragment {
5329     AllocaInst *Alloca;
5330     uint64_t Offset;
5331     uint64_t Size;
5332     Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5333         : Alloca(AI), Offset(O), Size(S) {}
5334   };
5335   SmallVector<Fragment, 4> Fragments;
5336 
5337   // Rewrite each partition.
5338   for (auto &P : AS.partitions()) {
5339     if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5340       Changed = true;
5341       if (NewAI != &AI) {
5342         uint64_t SizeOfByte = 8;
5343         uint64_t AllocaSize =
5344             DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
5345         // Don't include any padding.
5346         uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
5347         Fragments.push_back(
5348             Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5349       }
5350     }
5351     ++NumPartitions;
5352   }
5353 
5354   NumAllocaPartitions += NumPartitions;
5355   MaxPartitionsPerAlloca.updateMax(NumPartitions);
5356 
5357   // Migrate debug information from the old alloca to the new alloca(s)
5358   // and the individual partitions.
5359   auto MigrateOne = [&](auto *DbgVariable) {
5360     // Can't overlap with undef memory.
5361     if (isKillAddress(DbgVariable))
5362       return;
5363 
5364     const Value *DbgPtr = getAddress(DbgVariable);
5365     DIExpression::FragmentInfo VarFrag =
5366         DbgVariable->getFragmentOrEntireVariable();
5367     // Get the address expression constant offset if one exists and the ops
5368     // that come after it.
5369     int64_t CurrentExprOffsetInBytes = 0;
5370     SmallVector<uint64_t> PostOffsetOps;
5371     if (!getAddressExpression(DbgVariable)
5372              ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5373       return; // Couldn't interpret this DIExpression - drop the var.
5374 
5375     // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5376     int64_t ExtractOffsetInBits = 0;
5377     for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5378       if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5379           Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5380         ExtractOffsetInBits = Op.getArg(0);
5381         break;
5382       }
5383     }
5384 
5385     DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5386     for (auto Fragment : Fragments) {
5387       int64_t OffsetFromLocationInBits;
5388       std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5389       // Find the variable fragment that the new alloca slice covers.
5390       // Drop debug info for this variable fragment if we can't compute an
5391       // intersect between it and the alloca slice.
5392       if (!DIExpression::calculateFragmentIntersect(
5393               DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5394               CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5395               NewDbgFragment, OffsetFromLocationInBits))
5396         continue; // Do not migrate this fragment to this slice.
5397 
5398       // Zero sized fragment indicates there's no intersect between the variable
5399       // fragment and the alloca slice. Skip this slice for this variable
5400       // fragment.
5401       if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5402         continue; // Do not migrate this fragment to this slice.
5403 
5404       // No fragment indicates DbgVariable's variable or fragment exactly
5405       // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5406       if (!NewDbgFragment)
5407         NewDbgFragment = DbgVariable->getFragment();
5408 
5409       // Reduce the new expression offset by the bit-extract offset since
5410       // we'll be keeping that.
5411       int64_t OffestFromNewAllocaInBits =
5412           OffsetFromLocationInBits - ExtractOffsetInBits;
5413       // We need to adjust an existing bit extract if the offset expression
5414       // can't eat the slack (i.e., if the new offset would be negative).
5415       int64_t BitExtractOffset =
5416           std::min<int64_t>(0, OffestFromNewAllocaInBits);
5417       // The magnitude of a negative value indicates the number of bits into
5418       // the existing variable fragment that the memory region begins. The new
5419       // variable fragment already excludes those bits - the new DbgPtr offset
5420       // only needs to be applied if it's positive.
5421       OffestFromNewAllocaInBits =
5422           std::max(int64_t(0), OffestFromNewAllocaInBits);
5423 
5424       // Rebuild the expression:
5425       //    {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5426       // Add NewDbgFragment later, because dbg.assigns don't want it in the
5427       // address expression but the value expression instead.
5428       DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5429       if (OffestFromNewAllocaInBits > 0) {
5430         int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5431         NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5432       }
5433 
5434       // Remove any existing intrinsics on the new alloca describing
5435       // the variable fragment.
5436       auto RemoveOne = [DbgVariable](auto *OldDII) {
5437         auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5438           return LHS->getVariable() == RHS->getVariable() &&
5439                  LHS->getDebugLoc()->getInlinedAt() ==
5440                      RHS->getDebugLoc()->getInlinedAt();
5441         };
5442         if (SameVariableFragment(OldDII, DbgVariable))
5443           OldDII->eraseFromParent();
5444       };
5445       for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
5446       for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5447       for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5448       insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5449                        NewDbgFragment, BitExtractOffset);
5450     }
5451   };
5452 
5453   // Migrate debug information from the old alloca to the new alloca(s)
5454   // and the individual partitions.
5455   for_each(findDbgDeclares(&AI), MigrateOne);
5456   for_each(findDVRDeclares(&AI), MigrateOne);
5457   for_each(findDVRValues(&AI), MigrateOne);
5458   for_each(at::getAssignmentMarkers(&AI), MigrateOne);
5459   for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5460 
5461   return Changed;
5462 }
5463 
5464 /// Clobber a use with poison, deleting the used value if it becomes dead.
5465 void SROA::clobberUse(Use &U) {
5466   Value *OldV = U;
5467   // Replace the use with an poison value.
5468   U = PoisonValue::get(OldV->getType());
5469 
5470   // Check for this making an instruction dead. We have to garbage collect
5471   // all the dead instructions to ensure the uses of any alloca end up being
5472   // minimal.
5473   if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5474     if (isInstructionTriviallyDead(OldI)) {
5475       DeadInsts.push_back(OldI);
5476     }
5477 }
5478 
5479 /// A basic LoadAndStorePromoter that does not remove store nodes.
5480 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5481 public:
5482   BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5483                             Type *ZeroType)
5484       : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5485   bool shouldDelete(Instruction *I) const override {
5486     return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5487   }
5488 
5489   Value *getValueToUseForAlloca(Instruction *I) const override {
5490     return UndefValue::get(ZeroType);
5491   }
5492 
5493 private:
5494   Type *ZeroType;
5495 };
5496 
5497 bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5498   // Look through each "partition", looking for slices with the same start/end
5499   // that do not overlap with any before them. The slices are sorted by
5500   // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5501   // sophisticated algorithm that takes splittable slices into account.
5502   auto PartitionBegin = AS.begin();
5503   auto PartitionEnd = PartitionBegin;
5504   uint64_t BeginOffset = PartitionBegin->beginOffset();
5505   uint64_t EndOffset = PartitionBegin->endOffset();
5506   while (PartitionBegin != AS.end()) {
5507     bool AllSameAndValid = true;
5508     SmallVector<Instruction *> Insts;
5509     Type *PartitionType = nullptr;
5510     while (PartitionEnd != AS.end() &&
5511            (PartitionEnd->beginOffset() < EndOffset ||
5512             PartitionEnd->endOffset() <= EndOffset)) {
5513       if (AllSameAndValid) {
5514         AllSameAndValid &= PartitionEnd->beginOffset() == BeginOffset &&
5515                            PartitionEnd->endOffset() == EndOffset;
5516         Instruction *User =
5517             cast<Instruction>(PartitionEnd->getUse()->getUser());
5518         if (auto *LI = dyn_cast<LoadInst>(User)) {
5519           Type *UserTy = LI->getType();
5520           // LoadAndStorePromoter requires all the types to be the same.
5521           if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5522             AllSameAndValid = false;
5523           PartitionType = UserTy;
5524           Insts.push_back(User);
5525         } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5526           Type *UserTy = SI->getValueOperand()->getType();
5527           if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5528             AllSameAndValid = false;
5529           PartitionType = UserTy;
5530           Insts.push_back(User);
5531         } else if (!isAssumeLikeIntrinsic(User)) {
5532           AllSameAndValid = false;
5533         }
5534       }
5535       EndOffset = std::max(EndOffset, PartitionEnd->endOffset());
5536       ++PartitionEnd;
5537     }
5538 
5539     // So long as all the slices start and end offsets matched, update loads to
5540     // the values stored in the partition.
5541     if (AllSameAndValid && !Insts.empty()) {
5542       LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5543                         << EndOffset << ")\n");
5544       SmallVector<PHINode *, 4> NewPHIs;
5545       SSAUpdater SSA(&NewPHIs);
5546       Insts.push_back(&AI);
5547       BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5548       Promoter.run(Insts);
5549     }
5550 
5551     // Step on to the next partition.
5552     PartitionBegin = PartitionEnd;
5553     if (PartitionBegin == AS.end())
5554       break;
5555     BeginOffset = PartitionBegin->beginOffset();
5556     EndOffset = PartitionBegin->endOffset();
5557   }
5558   return true;
5559 }
5560 
5561 /// Analyze an alloca for SROA.
5562 ///
5563 /// This analyzes the alloca to ensure we can reason about it, builds
5564 /// the slices of the alloca, and then hands it off to be split and
5565 /// rewritten as needed.
5566 std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5567 SROA::runOnAlloca(AllocaInst &AI) {
5568   bool Changed = false;
5569   bool CFGChanged = false;
5570 
5571   LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5572   ++NumAllocasAnalyzed;
5573 
5574   // Special case dead allocas, as they're trivial.
5575   if (AI.use_empty()) {
5576     AI.eraseFromParent();
5577     Changed = true;
5578     return {Changed, CFGChanged};
5579   }
5580   const DataLayout &DL = AI.getDataLayout();
5581 
5582   // Skip alloca forms that this analysis can't handle.
5583   auto *AT = AI.getAllocatedType();
5584   TypeSize Size = DL.getTypeAllocSize(AT);
5585   if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5586       Size.getFixedValue() == 0)
5587     return {Changed, CFGChanged};
5588 
5589   // First, split any FCA loads and stores touching this alloca to promote
5590   // better splitting and promotion opportunities.
5591   IRBuilderTy IRB(&AI);
5592   AggLoadStoreRewriter AggRewriter(DL, IRB);
5593   Changed |= AggRewriter.rewrite(AI);
5594 
5595   // Build the slices using a recursive instruction-visiting builder.
5596   AllocaSlices AS(DL, AI);
5597   LLVM_DEBUG(AS.print(dbgs()));
5598   if (AS.isEscaped())
5599     return {Changed, CFGChanged};
5600 
5601   if (AS.isEscapedReadOnly()) {
5602     Changed |= propagateStoredValuesToLoads(AI, AS);
5603     return {Changed, CFGChanged};
5604   }
5605 
5606   // Delete all the dead users of this alloca before splitting and rewriting it.
5607   for (Instruction *DeadUser : AS.getDeadUsers()) {
5608     // Free up everything used by this instruction.
5609     for (Use &DeadOp : DeadUser->operands())
5610       clobberUse(DeadOp);
5611 
5612     // Now replace the uses of this instruction.
5613     DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5614 
5615     // And mark it for deletion.
5616     DeadInsts.push_back(DeadUser);
5617     Changed = true;
5618   }
5619   for (Use *DeadOp : AS.getDeadOperands()) {
5620     clobberUse(*DeadOp);
5621     Changed = true;
5622   }
5623 
5624   // No slices to split. Leave the dead alloca for a later pass to clean up.
5625   if (AS.begin() == AS.end())
5626     return {Changed, CFGChanged};
5627 
5628   Changed |= splitAlloca(AI, AS);
5629 
5630   LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
5631   while (!SpeculatablePHIs.empty())
5632     speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5633 
5634   LLVM_DEBUG(dbgs() << "  Rewriting Selects\n");
5635   auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5636   while (!RemainingSelectsToRewrite.empty()) {
5637     const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5638     CFGChanged |=
5639         rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5640   }
5641 
5642   return {Changed, CFGChanged};
5643 }
5644 
5645 /// Delete the dead instructions accumulated in this run.
5646 ///
5647 /// Recursively deletes the dead instructions we've accumulated. This is done
5648 /// at the very end to maximize locality of the recursive delete and to
5649 /// minimize the problems of invalidated instruction pointers as such pointers
5650 /// are used heavily in the intermediate stages of the algorithm.
5651 ///
5652 /// We also record the alloca instructions deleted here so that they aren't
5653 /// subsequently handed to mem2reg to promote.
5654 bool SROA::deleteDeadInstructions(
5655     SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5656   bool Changed = false;
5657   while (!DeadInsts.empty()) {
5658     Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5659     if (!I)
5660       continue;
5661     LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5662 
5663     // If the instruction is an alloca, find the possible dbg.declare connected
5664     // to it, and remove it too. We must do this before calling RAUW or we will
5665     // not be able to find it.
5666     if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5667       DeletedAllocas.insert(AI);
5668       for (DbgDeclareInst *OldDII : findDbgDeclares(AI))
5669         OldDII->eraseFromParent();
5670       for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5671         OldDII->eraseFromParent();
5672     }
5673 
5674     at::deleteAssignmentMarkers(I);
5675     I->replaceAllUsesWith(UndefValue::get(I->getType()));
5676 
5677     for (Use &Operand : I->operands())
5678       if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5679         // Zero out the operand and see if it becomes trivially dead.
5680         Operand = nullptr;
5681         if (isInstructionTriviallyDead(U))
5682           DeadInsts.push_back(U);
5683       }
5684 
5685     ++NumDeleted;
5686     I->eraseFromParent();
5687     Changed = true;
5688   }
5689   return Changed;
5690 }
5691 /// Promote the allocas, using the best available technique.
5692 ///
5693 /// This attempts to promote whatever allocas have been identified as viable in
5694 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
5695 /// This function returns whether any promotion occurred.
5696 bool SROA::promoteAllocas(Function &F) {
5697   if (PromotableAllocas.empty())
5698     return false;
5699 
5700   if (SROASkipMem2Reg) {
5701     LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5702   } else {
5703     LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5704     NumPromoted += PromotableAllocas.size();
5705     PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5706   }
5707 
5708   PromotableAllocas.clear();
5709   return true;
5710 }
5711 
5712 std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
5713   LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5714 
5715   const DataLayout &DL = F.getDataLayout();
5716   BasicBlock &EntryBB = F.getEntryBlock();
5717   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
5718        I != E; ++I) {
5719     if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5720       if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
5721           isAllocaPromotable(AI))
5722         PromotableAllocas.insert(AI);
5723       else
5724         Worklist.insert(AI);
5725     }
5726   }
5727 
5728   bool Changed = false;
5729   bool CFGChanged = false;
5730   // A set of deleted alloca instruction pointers which should be removed from
5731   // the list of promotable allocas.
5732   SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
5733 
5734   do {
5735     while (!Worklist.empty()) {
5736       auto [IterationChanged, IterationCFGChanged] =
5737           runOnAlloca(*Worklist.pop_back_val());
5738       Changed |= IterationChanged;
5739       CFGChanged |= IterationCFGChanged;
5740 
5741       Changed |= deleteDeadInstructions(DeletedAllocas);
5742 
5743       // Remove the deleted allocas from various lists so that we don't try to
5744       // continue processing them.
5745       if (!DeletedAllocas.empty()) {
5746         Worklist.set_subtract(DeletedAllocas);
5747         PostPromotionWorklist.set_subtract(DeletedAllocas);
5748         PromotableAllocas.set_subtract(DeletedAllocas);
5749         DeletedAllocas.clear();
5750       }
5751     }
5752 
5753     Changed |= promoteAllocas(F);
5754 
5755     Worklist = PostPromotionWorklist;
5756     PostPromotionWorklist.clear();
5757   } while (!Worklist.empty());
5758 
5759   assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
5760   assert((!CFGChanged || !PreserveCFG) &&
5761          "Should not have modified the CFG when told to preserve it.");
5762 
5763   if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
5764     for (auto &BB : F) {
5765       RemoveRedundantDbgInstrs(&BB);
5766     }
5767   }
5768 
5769   return {Changed, CFGChanged};
5770 }
5771 
5772 PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
5773   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
5774   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
5775   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
5776   auto [Changed, CFGChanged] =
5777       SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
5778   if (!Changed)
5779     return PreservedAnalyses::all();
5780   PreservedAnalyses PA;
5781   if (!CFGChanged)
5782     PA.preserveSet<CFGAnalyses>();
5783   PA.preserve<DominatorTreeAnalysis>();
5784   return PA;
5785 }
5786 
5787 void SROAPass::printPipeline(
5788     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
5789   static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
5790       OS, MapClassName2PassName);
5791   OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
5792                                                  : "<modify-cfg>");
5793 }
5794 
5795 SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
5796 
5797 namespace {
5798 
5799 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
5800 class SROALegacyPass : public FunctionPass {
5801   SROAOptions PreserveCFG;
5802 
5803 public:
5804   static char ID;
5805 
5806   SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
5807       : FunctionPass(ID), PreserveCFG(PreserveCFG) {
5808     initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
5809   }
5810 
5811   bool runOnFunction(Function &F) override {
5812     if (skipFunction(F))
5813       return false;
5814 
5815     DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5816     AssumptionCache &AC =
5817         getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
5818     DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
5819     auto [Changed, _] =
5820         SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
5821     return Changed;
5822   }
5823 
5824   void getAnalysisUsage(AnalysisUsage &AU) const override {
5825     AU.addRequired<AssumptionCacheTracker>();
5826     AU.addRequired<DominatorTreeWrapperPass>();
5827     AU.addPreserved<GlobalsAAWrapperPass>();
5828     AU.addPreserved<DominatorTreeWrapperPass>();
5829   }
5830 
5831   StringRef getPassName() const override { return "SROA"; }
5832 };
5833 
5834 } // end anonymous namespace
5835 
5836 char SROALegacyPass::ID = 0;
5837 
5838 FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
5839   return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
5840                                         : SROAOptions::ModifyCFG);
5841 }
5842 
5843 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
5844                       "Scalar Replacement Of Aggregates", false, false)
5845 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
5846 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
5847 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
5848                     false, false)
5849