1 //===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the visit functions for load, store and alloca.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "InstCombineInternal.h"
14 #include "llvm/ADT/MapVector.h"
15 #include "llvm/ADT/SetOperations.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/Analysis/AliasAnalysis.h"
19 #include "llvm/Analysis/Loads.h"
20 #include "llvm/IR/DataLayout.h"
21 #include "llvm/IR/DebugInfoMetadata.h"
22 #include "llvm/IR/IntrinsicInst.h"
23 #include "llvm/IR/LLVMContext.h"
24 #include "llvm/IR/PatternMatch.h"
25 #include "llvm/Transforms/InstCombine/InstCombiner.h"
26 #include "llvm/Transforms/Utils/Local.h"
27 using namespace llvm;
28 using namespace PatternMatch;
29 
30 #define DEBUG_TYPE "instcombine"
31 
32 STATISTIC(NumDeadStore, "Number of dead stores eliminated");
33 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
34 
35 static cl::opt<unsigned> MaxCopiedFromConstantUsers(
36     "instcombine-max-copied-from-constant-users", cl::init(300),
37     cl::desc("Maximum users to visit in copy from constant transform"),
38     cl::Hidden);
39 
40 /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
41 /// pointer to an alloca.  Ignore any reads of the pointer; return false if we
42 /// see any stores or other unknown uses.  If we see pointer arithmetic, keep
43 /// track of whether it moves the pointer (with IsOffset) but otherwise traverse
44 /// the uses.  If we see a memcpy/memmove that targets an unoffset pointer to
45 /// the alloca, and if the source pointer is a pointer to a constant memory
46 /// location, we can optimize this.
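///
/// For example (illustrative IR), the pattern this recognizes looks like:
///   %a = alloca [10 x i32]
///   call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr @g, i64 40, i1 false)
///   ... only reads of %a ...
/// where @g is a constant global; TheCopy is set to the memcpy so the caller
/// can forward the constant memory directly.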
47 static bool
48 isOnlyCopiedFromConstantMemory(AAResults *AA, AllocaInst *V,
49                                MemTransferInst *&TheCopy,
50                                SmallVectorImpl<Instruction *> &ToDelete) {
51   // We track lifetime intrinsics as we encounter them.  If we decide to go
52   // ahead and replace the value with the memory location, this lets the caller
53   // quickly eliminate the markers.
54 
55   using ValueAndIsOffset = PointerIntPair<Value *, 1, bool>;
56   SmallVector<ValueAndIsOffset, 32> Worklist;
57   SmallPtrSet<ValueAndIsOffset, 32> Visited;
58   Worklist.emplace_back(V, false);
59   while (!Worklist.empty()) {
60     ValueAndIsOffset Elem = Worklist.pop_back_val();
61     if (!Visited.insert(Elem).second)
62       continue;
63     if (Visited.size() > MaxCopiedFromConstantUsers)
64       return false;
65 
66     const auto [Value, IsOffset] = Elem;
67     for (auto &U : Value->uses()) {
68       auto *I = cast<Instruction>(U.getUser());
69 
70       if (auto *LI = dyn_cast<LoadInst>(I)) {
71         // Simple (non-volatile, non-atomic) loads are always OK to ignore.
72         if (!LI->isSimple()) return false;
73         continue;
74       }
75 
76       if (isa<PHINode, SelectInst>(I)) {
77         // We set IsOffset=true, to forbid the memcpy from occurring after the
78         // phi: If one of the phi operands is not based on the alloca, we
79         // would incorrectly omit a write.
80         Worklist.emplace_back(I, true);
81         continue;
82       }
83       if (isa<BitCastInst, AddrSpaceCastInst>(I)) {
84         // If uses of the bitcast are ok, we are ok.
85         Worklist.emplace_back(I, IsOffset);
86         continue;
87       }
88       if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
89         // If the GEP has all zero indices, it doesn't offset the pointer;
90         // otherwise, it does.
91         Worklist.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
92         continue;
93       }
94 
95       if (auto *Call = dyn_cast<CallBase>(I)) {
96         // If this is the function being called then we treat it like a load and
97         // ignore it.
98         if (Call->isCallee(&U))
99           continue;
100 
101         unsigned DataOpNo = Call->getDataOperandNo(&U);
102         bool IsArgOperand = Call->isArgOperand(&U);
103 
104         // Inalloca arguments are clobbered by the call.
105         if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
106           return false;
107 
108         // If this call site doesn't modify the memory, then we know it is just
109         // a load (but one that potentially returns the value itself), so we can
110         // ignore it if we know that the value isn't captured.
111         bool NoCapture = Call->doesNotCapture(DataOpNo);
112         if ((Call->onlyReadsMemory() && (Call->use_empty() || NoCapture)) ||
113             (Call->onlyReadsMemory(DataOpNo) && NoCapture))
114           continue;
115       }
116 
117       // Lifetime intrinsics can be handled by the caller.
118       if (I->isLifetimeStartOrEnd()) {
119         assert(I->use_empty() && "Lifetime markers have no result to use!");
120         ToDelete.push_back(I);
121         continue;
122       }
123 
124       // If this isn't our memcpy/memmove, reject it as something we can't
125       // handle.
126       MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
127       if (!MI)
128         return false;
129 
130       // If the transfer is volatile, reject it.
131       if (MI->isVolatile())
132         return false;
133 
134       // If the transfer uses the alloca as its source, it is effectively a
135       // load, so ignore it (volatile transfers were already rejected above).
136       if (U.getOperandNo() == 1)
137         continue;
138 
139       // If we already have seen a copy, reject the second one.
140       if (TheCopy) return false;
141 
142       // If the pointer has been offset from the start of the alloca, we can't
143       // safely handle this.
144       if (IsOffset) return false;
145 
146       // If the memintrinsic isn't using the alloca as the dest, reject it.
147       if (U.getOperandNo() != 0) return false;
148 
149       // If the source of the memcpy/move is not constant, reject it.
150       if (isModSet(AA->getModRefInfoMask(MI->getSource())))
151         return false;
152 
153       // Otherwise, the transform is safe.  Remember the copy instruction.
154       TheCopy = MI;
155     }
156   }
157   return true;
158 }
159 
160 /// isOnlyCopiedFromConstantMemory - Return true if the specified alloca is only
161 /// modified by a copy from a constant memory location. If we can prove this, we
162 /// can replace any uses of the alloca with uses of the memory location
163 /// directly.
164 static MemTransferInst *
165 isOnlyCopiedFromConstantMemory(AAResults *AA,
166                                AllocaInst *AI,
167                                SmallVectorImpl<Instruction *> &ToDelete) {
168   MemTransferInst *TheCopy = nullptr;
169   if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
170     return TheCopy;
171   return nullptr;
172 }
173 
174 /// Returns true if V is dereferenceable for the size of the alloca.
175 static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
176                                            const DataLayout &DL) {
177   if (AI->isArrayAllocation())
178     return false;
179   uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
180   if (!AllocaSize)
181     return false;
182   return isDereferenceableAndAlignedPointer(V, AI->getAlign(),
183                                             APInt(64, AllocaSize), DL);
184 }
185 
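// Canonicalize the array size operand of an alloca.  For example (illustrative
// IR), "%a = alloca i32, i32 4" becomes an "alloca [4 x i32]", a scalar alloca
// gets the canonical "i32 1" array size, and a non-constant size is cast to the
// pointer index type so that the cast is exposed to later folds.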
186 static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
187                                             AllocaInst &AI, DominatorTree &DT) {
188   // Check for array size of 1 (scalar allocation).
189   if (!AI.isArrayAllocation()) {
190     // i32 1 is the canonical array size for scalar allocations.
191     if (AI.getArraySize()->getType()->isIntegerTy(32))
192       return nullptr;
193 
194     // Canonicalize it.
195     return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
196   }
197 
198   // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
199   if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
200     if (C->getValue().getActiveBits() <= 64) {
201       Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
202       AllocaInst *New = IC.Builder.CreateAlloca(NewTy, AI.getAddressSpace(),
203                                                 nullptr, AI.getName());
204       New->setAlignment(AI.getAlign());
205       New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
206 
207       replaceAllDbgUsesWith(AI, *New, *New, DT);
208       return IC.replaceInstUsesWith(AI, New);
209     }
210   }
211 
212   if (isa<UndefValue>(AI.getArraySize()))
213     return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
214 
215   // Ensure that the alloca array size argument has the same type as the index
216   // type of the alloca's pointer, which, in the typical case, is intptr_t,
217   // so that any casting is exposed early.
218   Type *PtrIdxTy = IC.getDataLayout().getIndexType(AI.getType());
219   if (AI.getArraySize()->getType() != PtrIdxTy) {
220     Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), PtrIdxTy, false);
221     return IC.replaceOperand(AI, 0, V);
222   }
223 
224   return nullptr;
225 }
226 
227 namespace {
228 // If I and V are pointers in different address spaces, it is not allowed to
229 // use replaceAllUsesWith since I and V have different types. A
230 // non-target-specific transformation should not use addrspacecast on V since
231 // the two address spaces may be disjoint depending on the target.
232 //
233 // This class chases down uses of the old pointer until reaching the load
234 // instructions, then replaces the old pointer in the load instructions with
235 // the new pointer. If during the chasing it sees a bitcast or GEP, it will
236 // create a new bitcast or GEP with the new pointer and use it in the load
237 // instruction.
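// For example, this is used when an alloca in one address space is replaced by
// a constant global in a different address space: the loads (and any GEPs,
// selects, phis, memcpys, and addrspacecasts feeding them) are recreated
// against the new pointer instead of blindly introducing an addrspacecast.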
238 class PointerReplacer {
239 public:
240   PointerReplacer(InstCombinerImpl &IC, Instruction &Root, unsigned SrcAS)
241       : IC(IC), Root(Root), FromAS(SrcAS) {}
242 
243   bool collectUsers();
244   void replacePointer(Value *V);
245 
246 private:
247   bool collectUsersRecursive(Instruction &I);
248   void replace(Instruction *I);
249   Value *getReplacement(Value *I);
250   bool isAvailable(Instruction *I) const {
251     return I == &Root || Worklist.contains(I);
252   }
253 
254   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
255                                    unsigned FromAS) const {
256     const auto *ASC = dyn_cast<AddrSpaceCastInst>(I);
257     if (!ASC)
258       return false;
259     unsigned ToAS = ASC->getDestAddressSpace();
260     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
261   }
262 
263   SmallPtrSet<Instruction *, 32> ValuesToRevisit;
264   SmallSetVector<Instruction *, 4> Worklist;
265   MapVector<Value *, Value *> WorkMap;
266   InstCombinerImpl &IC;
267   Instruction &Root;
268   unsigned FromAS;
269 };
270 } // end anonymous namespace
271 
272 bool PointerReplacer::collectUsers() {
273   if (!collectUsersRecursive(Root))
274     return false;
275 
276   // Ensure that all outstanding (indirect) users of the root pointer
277   // are inserted into the Worklist. Return false
278   // otherwise.
279   return llvm::set_is_subset(ValuesToRevisit, Worklist);
280 }
281 
282 bool PointerReplacer::collectUsersRecursive(Instruction &I) {
283   for (auto *U : I.users()) {
284     auto *Inst = cast<Instruction>(&*U);
285     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
286       if (Load->isVolatile())
287         return false;
288       Worklist.insert(Load);
289     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
290       // All incoming values must be instructions for replaceability.
291       if (any_of(PHI->incoming_values(),
292                  [](Value *V) { return !isa<Instruction>(V); }))
293         return false;
294 
295       // If at least one incoming value of the PHI is not in Worklist,
296       // store the PHI for revisiting and skip this iteration of the
297       // loop.
298       if (any_of(PHI->incoming_values(), [this](Value *V) {
299             return !isAvailable(cast<Instruction>(V));
300           })) {
301         ValuesToRevisit.insert(Inst);
302         continue;
303       }
304 
305       Worklist.insert(PHI);
306       if (!collectUsersRecursive(*PHI))
307         return false;
308     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
309       if (!isa<Instruction>(SI->getTrueValue()) ||
310           !isa<Instruction>(SI->getFalseValue()))
311         return false;
312 
313       if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
314           !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
315         ValuesToRevisit.insert(Inst);
316         continue;
317       }
318       Worklist.insert(SI);
319       if (!collectUsersRecursive(*SI))
320         return false;
321     } else if (isa<GetElementPtrInst>(Inst)) {
322       Worklist.insert(Inst);
323       if (!collectUsersRecursive(*Inst))
324         return false;
325     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
326       if (MI->isVolatile())
327         return false;
328       Worklist.insert(Inst);
329     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
330       Worklist.insert(Inst);
331       if (!collectUsersRecursive(*Inst))
332         return false;
333     } else if (Inst->isLifetimeStartOrEnd()) {
334       continue;
335     } else {
336       // TODO: For arbitrary uses with address space mismatches, should we check
337       // if we can introduce a valid addrspacecast?
338       LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
339       return false;
340     }
341   }
342 
343   return true;
344 }
345 
346 Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
347 
348 void PointerReplacer::replace(Instruction *I) {
349   if (getReplacement(I))
350     return;
351 
352   if (auto *LT = dyn_cast<LoadInst>(I)) {
353     auto *V = getReplacement(LT->getPointerOperand());
354     assert(V && "Operand not replaced");
355     auto *NewI = new LoadInst(LT->getType(), V, "", LT->isVolatile(),
356                               LT->getAlign(), LT->getOrdering(),
357                               LT->getSyncScopeID());
358     NewI->takeName(LT);
359     copyMetadataForLoad(*NewI, *LT);
360 
361     IC.InsertNewInstWith(NewI, LT->getIterator());
362     IC.replaceInstUsesWith(*LT, NewI);
363     WorkMap[LT] = NewI;
364   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
365     Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
366     auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
367                                    PHI->getName(), PHI->getIterator());
368     for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
369       NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
370                           PHI->getIncomingBlock(I));
371     WorkMap[PHI] = NewPHI;
372   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
373     auto *V = getReplacement(GEP->getPointerOperand());
374     assert(V && "Operand not replaced");
375     SmallVector<Value *, 8> Indices(GEP->indices());
376     auto *NewI =
377         GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices);
378     IC.InsertNewInstWith(NewI, GEP->getIterator());
379     NewI->takeName(GEP);
380     NewI->setNoWrapFlags(GEP->getNoWrapFlags());
381     WorkMap[GEP] = NewI;
382   } else if (auto *SI = dyn_cast<SelectInst>(I)) {
383     Value *TrueValue = SI->getTrueValue();
384     Value *FalseValue = SI->getFalseValue();
385     if (Value *Replacement = getReplacement(TrueValue))
386       TrueValue = Replacement;
387     if (Value *Replacement = getReplacement(FalseValue))
388       FalseValue = Replacement;
389     auto *NewSI = SelectInst::Create(SI->getCondition(), TrueValue, FalseValue,
390                                      SI->getName(), nullptr, SI);
391     IC.InsertNewInstWith(NewSI, SI->getIterator());
392     NewSI->takeName(SI);
393     WorkMap[SI] = NewSI;
394   } else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
395     auto *DestV = MemCpy->getRawDest();
396     auto *SrcV = MemCpy->getRawSource();
397 
398     if (auto *DestReplace = getReplacement(DestV))
399       DestV = DestReplace;
400     if (auto *SrcReplace = getReplacement(SrcV))
401       SrcV = SrcReplace;
402 
403     IC.Builder.SetInsertPoint(MemCpy);
404     auto *NewI = IC.Builder.CreateMemTransferInst(
405         MemCpy->getIntrinsicID(), DestV, MemCpy->getDestAlign(), SrcV,
406         MemCpy->getSourceAlign(), MemCpy->getLength(), MemCpy->isVolatile());
407     AAMDNodes AAMD = MemCpy->getAAMetadata();
408     if (AAMD)
409       NewI->setAAMetadata(AAMD);
410 
411     IC.eraseInstFromFunction(*MemCpy);
412     WorkMap[MemCpy] = NewI;
413   } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I)) {
414     auto *V = getReplacement(ASC->getPointerOperand());
415     assert(V && "Operand not replaced");
416     assert(isEqualOrValidAddrSpaceCast(
417                ASC, V->getType()->getPointerAddressSpace()) &&
418            "Invalid address space cast!");
419 
420     if (V->getType()->getPointerAddressSpace() !=
421         ASC->getType()->getPointerAddressSpace()) {
422       auto *NewI = new AddrSpaceCastInst(V, ASC->getType(), "");
423       NewI->takeName(ASC);
424       IC.InsertNewInstWith(NewI, ASC->getIterator());
425       WorkMap[ASC] = NewI;
426     } else {
427       WorkMap[ASC] = V;
428     }
429 
430   } else {
431     llvm_unreachable("should never reach here");
432   }
433 }
434 
435 void PointerReplacer::replacePointer(Value *V) {
436 #ifndef NDEBUG
437   auto *PT = cast<PointerType>(Root.getType());
438   auto *NT = cast<PointerType>(V->getType());
439   assert(PT != NT && "Invalid usage");
440 #endif
441   WorkMap[&Root] = V;
442 
443   for (Instruction *Workitem : Worklist)
444     replace(Workitem);
445 }
446 
447 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
448   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
449     return I;
450 
451   if (AI.getAllocatedType()->isSized()) {
452     // Move all allocas of zero-byte objects to the entry block and merge them
453     // together.  Note that we only do this for allocas, because malloc should
454     // allocate and return a unique pointer, even for a zero-byte allocation.
455     if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinValue() == 0) {
456       // For a zero sized alloca there is no point in doing an array allocation.
457       // This is helpful if the array size is a complicated expression not used
458       // elsewhere.
459       if (AI.isArrayAllocation())
460         return replaceOperand(AI, 0,
461             ConstantInt::get(AI.getArraySize()->getType(), 1));
462 
463       // Get the first instruction in the entry block.
464       BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
465       BasicBlock::iterator FirstInst = EntryBlock.getFirstNonPHIOrDbg();
466       if (&*FirstInst != &AI) {
467         // If the entry block doesn't start with a zero-size alloca then move
468         // this one to the start of the entry block.  There is no problem with
469         // dominance as the array size was already forced to a constant.
470         AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
471         if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
472             DL.getTypeAllocSize(EntryAI->getAllocatedType())
473                     .getKnownMinValue() != 0) {
474           AI.moveBefore(FirstInst);
475           return &AI;
476         }
477 
478         // Replace this zero-sized alloca with the one at the start of the entry
479         // block after ensuring that the address will be aligned enough for both
480         // types.
481         const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
482         EntryAI->setAlignment(MaxAlign);
483         return replaceInstUsesWith(AI, EntryAI);
484       }
485     }
486   }
487 
488   // Check to see if this allocation is only modified by a memcpy/memmove from
489   // a memory location whose alignment is equal to or exceeds that of the
490   // allocation. If this is the case, we can change all users to use the
491   // constant memory location instead.  This is commonly produced by the CFE for
492   // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
493   // is only subsequently read.
494   SmallVector<Instruction *, 4> ToDelete;
495   if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
496     Value *TheSrc = Copy->getSource();
497     Align AllocaAlign = AI.getAlign();
498     Align SourceAlign = getOrEnforceKnownAlignment(
499       TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
500     if (AllocaAlign <= SourceAlign &&
501         isDereferenceableForAllocaSize(TheSrc, &AI, DL) &&
502         !isa<Instruction>(TheSrc)) {
503       // FIXME: Can we sink instructions without violating dominance when TheSrc
504       // is an instruction instead of a constant or argument?
505       LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
506       LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
507       unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
508       if (AI.getAddressSpace() == SrcAddrSpace) {
509         for (Instruction *Delete : ToDelete)
510           eraseInstFromFunction(*Delete);
511 
512         Instruction *NewI = replaceInstUsesWith(AI, TheSrc);
513         eraseInstFromFunction(*Copy);
514         ++NumGlobalCopies;
515         return NewI;
516       }
517 
518       PointerReplacer PtrReplacer(*this, AI, SrcAddrSpace);
519       if (PtrReplacer.collectUsers()) {
520         for (Instruction *Delete : ToDelete)
521           eraseInstFromFunction(*Delete);
522 
523         PtrReplacer.replacePointer(TheSrc);
524         ++NumGlobalCopies;
525       }
526     }
527   }
528 
529   // Finally, use the generic allocation site handler to aggressively remove
530   // unused allocas.
531   return visitAllocSite(AI);
532 }
533 
534 // Are we allowed to form an atomic load or store of this type?
535 static bool isSupportedAtomicType(Type *Ty) {
536   return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
537 }
538 
539 /// Helper to combine a load to a new type.
540 ///
541 /// This just does the work of combining a load to a new type. It handles
542 /// metadata, etc., and returns the new instruction. The \c NewTy should be the
543 /// loaded *value* type. This will convert it to a pointer, cast the operand to
544 /// that pointer type, load it, etc.
545 ///
546 /// Note that this will create all of the instructions with whatever insert
547 /// point the \c InstCombinerImpl currently is using.
548 LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
549                                                  const Twine &Suffix) {
550   assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
551          "can't fold an atomic load to requested type");
552 
553   LoadInst *NewLoad =
554       Builder.CreateAlignedLoad(NewTy, LI.getPointerOperand(), LI.getAlign(),
555                                 LI.isVolatile(), LI.getName() + Suffix);
556   NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
557   copyMetadataForLoad(*NewLoad, LI);
558   return NewLoad;
559 }
560 
561 /// Combine a store to a new type.
562 ///
563 /// Returns the newly created store instruction.
564 static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
565                                          Value *V) {
566   assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
567          "can't fold an atomic store of requested type");
568 
569   Value *Ptr = SI.getPointerOperand();
570   SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
571   SI.getAllMetadata(MD);
572 
573   StoreInst *NewStore =
574       IC.Builder.CreateAlignedStore(V, Ptr, SI.getAlign(), SI.isVolatile());
575   NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
576   for (const auto &MDPair : MD) {
577     unsigned ID = MDPair.first;
578     MDNode *N = MDPair.second;
579     // Note, essentially every kind of metadata should be preserved here! This
580     // routine is supposed to clone a store instruction changing *only its
581     // type*. The only metadata it makes sense to drop is metadata which is
582     // invalidated when the pointer type changes. This should essentially
583     // never be the case in LLVM, but we explicitly switch over only known
584     // metadata to be conservatively correct. If you are adding metadata to
585     // LLVM which pertains to stores, you almost certainly want to add it
586     // here.
587     switch (ID) {
588     case LLVMContext::MD_dbg:
589     case LLVMContext::MD_DIAssignID:
590     case LLVMContext::MD_tbaa:
591     case LLVMContext::MD_prof:
592     case LLVMContext::MD_fpmath:
593     case LLVMContext::MD_tbaa_struct:
594     case LLVMContext::MD_alias_scope:
595     case LLVMContext::MD_noalias:
596     case LLVMContext::MD_nontemporal:
597     case LLVMContext::MD_mem_parallel_loop_access:
598     case LLVMContext::MD_access_group:
599       // All of these directly apply.
600       NewStore->setMetadata(ID, N);
601       break;
602     case LLVMContext::MD_invariant_load:
603     case LLVMContext::MD_nonnull:
604     case LLVMContext::MD_noundef:
605     case LLVMContext::MD_range:
606     case LLVMContext::MD_align:
607     case LLVMContext::MD_dereferenceable:
608     case LLVMContext::MD_dereferenceable_or_null:
609       // These don't apply for stores.
610       break;
611     }
612   }
613 
614   return NewStore;
615 }
616 
617 /// Combine loads to match the type of their uses' value after looking
618 /// through intervening bitcasts.
619 ///
620 /// The core idea here is that if the result of a load is used in an operation,
621 /// we should load the type most conducive to that operation. For example, when
622 /// loading an integer and immediately bitcasting it to a float, we should
623 /// instead directly load a float.
624 ///
625 /// However, this routine must never change the width of a load or the number of
626 /// loads as that would introduce a semantic change. This combine is expected to
627 /// be a semantic no-op which just allows loads to more closely model the types
628 /// of their consuming operations.
629 ///
630 /// Currently, we also refuse to change the precise type used for an atomic load
631 /// or a volatile load. This is debatable, and might be reasonable to change
632 /// later. However, it is risky in case some backend or other part of LLVM is
633 /// relying on the exact type loaded to select appropriate atomic operations.
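///
/// For example (illustrative IR), a load whose sole user is a no-op bitcast:
///   %x = load i64, ptr %p
///   %f = bitcast i64 %x to double
/// is rewritten to "%f = load double, ptr %p" and the cast is erased.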
634 static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
635                                                LoadInst &Load) {
636   // FIXME: We could probably with some care handle both volatile and ordered
637   // atomic loads here but it isn't clear that this is important.
638   if (!Load.isUnordered())
639     return nullptr;
640 
641   if (Load.use_empty())
642     return nullptr;
643 
644   // swifterror values can't be bitcasted.
645   if (Load.getPointerOperand()->isSwiftError())
646     return nullptr;
647 
648   // Fold away bit casts of the loaded value by loading the desired type.
649   // Note that we should not do this for pointer<->integer casts,
650   // because that would result in type punning.
651   if (Load.hasOneUse()) {
652     // Don't transform when the type is x86_amx; this keeps the pass that lowers
653     // the x86_amx type happy.
654     Type *LoadTy = Load.getType();
655     if (auto *BC = dyn_cast<BitCastInst>(Load.user_back())) {
656       assert(!LoadTy->isX86_AMXTy() && "Load from x86_amx* should not happen!");
657       if (BC->getType()->isX86_AMXTy())
658         return nullptr;
659     }
660 
661     if (auto *CastUser = dyn_cast<CastInst>(Load.user_back())) {
662       Type *DestTy = CastUser->getDestTy();
663       if (CastUser->isNoopCast(IC.getDataLayout()) &&
664           LoadTy->isPtrOrPtrVectorTy() == DestTy->isPtrOrPtrVectorTy() &&
665           (!Load.isAtomic() || isSupportedAtomicType(DestTy))) {
666         LoadInst *NewLoad = IC.combineLoadToNewType(Load, DestTy);
667         CastUser->replaceAllUsesWith(NewLoad);
668         IC.eraseInstFromFunction(*CastUser);
669         return &Load;
670       }
671     }
672   }
673 
674   // FIXME: We should also canonicalize loads of vectors when their elements are
675   // cast to other types.
676   return nullptr;
677 }
678 
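// Try to replace a load of an aggregate type with loads of its individual
// elements, re-packed with insertvalue.  For example (illustrative IR), a
// "load {i32, i32}, ptr %p" becomes two i32 loads at the element offsets
// followed by two insertvalues; single-element aggregates are simply unpacked.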
679 static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
680   // FIXME: We could probably with some care handle both volatile and atomic
681   // stores here but it isn't clear that this is important.
682   if (!LI.isSimple())
683     return nullptr;
684 
685   Type *T = LI.getType();
686   if (!T->isAggregateType())
687     return nullptr;
688 
689   StringRef Name = LI.getName();
690 
691   if (auto *ST = dyn_cast<StructType>(T)) {
692     // If the struct has only one element, we unpack it.
693     auto NumElements = ST->getNumElements();
694     if (NumElements == 1) {
695       LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
696                                                   ".unpack");
697       NewLoad->setAAMetadata(LI.getAAMetadata());
698       return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
699         PoisonValue::get(T), NewLoad, 0, Name));
700     }
701 
702     // We don't want to break up loads of padded structs here as we'd lose
703     // the knowledge that padding exists for the rest of the pipeline.
704     const DataLayout &DL = IC.getDataLayout();
705     auto *SL = DL.getStructLayout(ST);
706 
707     if (SL->hasPadding())
708       return nullptr;
709 
710     const auto Align = LI.getAlign();
711     auto *Addr = LI.getPointerOperand();
712     auto *IdxType = DL.getIndexType(Addr->getType());
713 
714     Value *V = PoisonValue::get(T);
715     for (unsigned i = 0; i < NumElements; i++) {
716       auto *Ptr = IC.Builder.CreateInBoundsPtrAdd(
717           Addr, IC.Builder.CreateTypeSize(IdxType, SL->getElementOffset(i)),
718           Name + ".elt");
719       auto *L = IC.Builder.CreateAlignedLoad(
720           ST->getElementType(i), Ptr,
721           commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue()),
722           Name + ".unpack");
723       // Propagate AA metadata. It'll still be valid on the narrowed load.
724       L->setAAMetadata(LI.getAAMetadata());
725       V = IC.Builder.CreateInsertValue(V, L, i);
726     }
727 
728     V->setName(Name);
729     return IC.replaceInstUsesWith(LI, V);
730   }
731 
732   if (auto *AT = dyn_cast<ArrayType>(T)) {
733     auto *ET = AT->getElementType();
734     auto NumElements = AT->getNumElements();
735     if (NumElements == 1) {
736       LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
737       NewLoad->setAAMetadata(LI.getAAMetadata());
738       return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
739         PoisonValue::get(T), NewLoad, 0, Name));
740     }
741 
742     // Bail out if the array is too large. Ideally we would like to optimize
743     // arrays of arbitrary size but this has a terrible impact on compile time.
744     // The threshold here is chosen arbitrarily and may need a bit of
745     // tuning.
746     if (NumElements > IC.MaxArraySizeForCombine)
747       return nullptr;
748 
749     const DataLayout &DL = IC.getDataLayout();
750     TypeSize EltSize = DL.getTypeAllocSize(ET);
751     const auto Align = LI.getAlign();
752 
753     auto *Addr = LI.getPointerOperand();
754     auto *IdxType = Type::getInt64Ty(T->getContext());
755     auto *Zero = ConstantInt::get(IdxType, 0);
756 
757     Value *V = PoisonValue::get(T);
758     TypeSize Offset = TypeSize::getZero();
759     for (uint64_t i = 0; i < NumElements; i++) {
760       Value *Indices[2] = {
761         Zero,
762         ConstantInt::get(IdxType, i),
763       };
764       auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, ArrayRef(Indices),
765                                                Name + ".elt");
766       auto EltAlign = commonAlignment(Align, Offset.getKnownMinValue());
767       auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
768                                              EltAlign, Name + ".unpack");
769       L->setAAMetadata(LI.getAAMetadata());
770       V = IC.Builder.CreateInsertValue(V, L, i);
771       Offset += EltSize;
772     }
773 
774     V->setName(Name);
775     return IC.replaceInstUsesWith(LI, V);
776   }
777 
778   return nullptr;
779 }
780 
781 // If we can determine that all possible objects pointed to by the provided
782 // pointer value are not only dereferenceable but also definitively less than or
783 // equal to the provided maximum size, then return true (constant globals and
784 // allocas typically fall into this category). Otherwise, return false.
785 //
786 // FIXME: This should probably live in ValueTracking (or similar).
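// For example (illustrative IR), given "@g = constant [4 x i32] zeroinitializer",
// a pointer based only on @g satisfies this query for any MaxSize >= 16, since
// the initializer type has a known 16-byte alloc size.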
787 static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
788                                      const DataLayout &DL) {
789   SmallPtrSet<Value *, 4> Visited;
790   SmallVector<Value *, 4> Worklist(1, V);
791 
792   do {
793     Value *P = Worklist.pop_back_val();
794     P = P->stripPointerCasts();
795 
796     if (!Visited.insert(P).second)
797       continue;
798 
799     if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
800       Worklist.push_back(SI->getTrueValue());
801       Worklist.push_back(SI->getFalseValue());
802       continue;
803     }
804 
805     if (PHINode *PN = dyn_cast<PHINode>(P)) {
806       append_range(Worklist, PN->incoming_values());
807       continue;
808     }
809 
810     if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
811       if (GA->isInterposable())
812         return false;
813       Worklist.push_back(GA->getAliasee());
814       continue;
815     }
816 
817     // If we know how big this object is, and it is less than MaxSize, continue
818     // searching. Otherwise, return false.
819     if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
820       if (!AI->getAllocatedType()->isSized())
821         return false;
822 
823       ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
824       if (!CS)
825         return false;
826 
827       TypeSize TS = DL.getTypeAllocSize(AI->getAllocatedType());
828       if (TS.isScalable())
829         return false;
830       // Make sure that, even if the multiplication below would wrap as a
831       // uint64_t, we still do the right thing.
832       if ((CS->getValue().zext(128) * APInt(128, TS.getFixedValue()))
833               .ugt(MaxSize))
834         return false;
835       continue;
836     }
837 
838     if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
839       if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
840         return false;
841 
842       uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
843       if (InitSize > MaxSize)
844         return false;
845       continue;
846     }
847 
848     return false;
849   } while (!Worklist.empty());
850 
851   return true;
852 }
853 
854 // If we're indexing into an object of a known size, and the outer index is
855 // not a constant, and any value other than zero would lead to undefined
856 // behavior, replace it with zero.
857 //
858 // For example, if we have:
859 // @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
860 // ...
861 // %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
862 // ... = load i32* %arrayidx, align 4
863 // Then we know that we can replace %x in the GEP with i64 0.
864 //
865 // FIXME: We could fold any GEP index to zero that would cause UB if it were
866 // not zero. Currently, we only handle the first such index. Also, we could
867 // search through non-zero constant indices if we kept track of the
868 // offsets those indices implied.
869 static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
870                                      GetElementPtrInst *GEPI, Instruction *MemI,
871                                      unsigned &Idx) {
872   if (GEPI->getNumOperands() < 2)
873     return false;
874 
875   // Find the first non-zero index of a GEP. If all indices are zero, return
876   // one past the last index.
877   auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
878     unsigned I = 1;
879     for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
880       Value *V = GEPI->getOperand(I);
881       if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
882         if (CI->isZero())
883           continue;
884 
885       break;
886     }
887 
888     return I;
889   };
890 
891   // Skip through initial 'zero' indices, and find the corresponding pointer
892   // type. See if the next index is not a constant.
893   Idx = FirstNZIdx(GEPI);
894   if (Idx == GEPI->getNumOperands())
895     return false;
896   if (isa<Constant>(GEPI->getOperand(Idx)))
897     return false;
898 
899   SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
900   Type *SourceElementType = GEPI->getSourceElementType();
901   // Size information about scalable vectors is not available, so we cannot
902   // deduce whether indexing at n is undefined behaviour or not. Bail out.
903   if (SourceElementType->isScalableTy())
904     return false;
905 
906   Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
907   if (!AllocTy || !AllocTy->isSized())
908     return false;
909   const DataLayout &DL = IC.getDataLayout();
910   uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy).getFixedValue();
911 
912   // If there are more indices after the one we might replace with a zero, make
913   // sure they're all non-negative. If any of them are negative, the overall
914   // address being computed might be before the base address determined by the
915   // first non-zero index.
916   auto IsAllNonNegative = [&]() {
917     for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
918       KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
919       if (Known.isNonNegative())
920         continue;
921       return false;
922     }
923 
924     return true;
925   };
926 
927   // FIXME: If the GEP is not inbounds, and there are extra indices after the
928   // one we'll replace, those could cause the address computation to wrap
929   // (rendering the IsAllNonNegative() check below insufficient). We can do
930   // better, ignoring zero indices (and other indices we can prove small
931   // enough not to wrap).
932   if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
933     return false;
934 
935   // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
936   // also known to be dereferenceable.
937   return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
938          IsAllNonNegative();
939 }
940 
941 // If we're indexing into an object with a variable index for the memory
942 // access, but the object has only one element, we can assume that the index
943 // will always be zero. If we replace the GEP, return it.
944 static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
945                                           Instruction &MemI) {
946   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
947     unsigned Idx;
948     if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
949       Instruction *NewGEPI = GEPI->clone();
950       NewGEPI->setOperand(Idx,
951         ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
952       IC.InsertNewInstBefore(NewGEPI, GEPI->getIterator());
953       return NewGEPI;
954     }
955   }
956 
957   return nullptr;
958 }
959 
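// Return true if this store is directly to a null pointer (possibly through a
// GEP) in an address space where null is not a defined address, so the store
// is UB and later passes can turn it into 'unreachable'.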
960 static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
961   if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
962     return false;
963 
964   auto *Ptr = SI.getPointerOperand();
965   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
966     Ptr = GEPI->getOperand(0);
967   return (isa<ConstantPointerNull>(Ptr) &&
968           !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
969 }
970 
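// Return true if this load is from an undef pointer, or from a null pointer
// (possibly through a GEP) in an address space where null is not a defined
// address; such a load is UB and can be replaced with poison plus a
// non-terminator unreachable marker.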
971 static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
972   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
973     const Value *GEPI0 = GEPI->getOperand(0);
974     if (isa<ConstantPointerNull>(GEPI0) &&
975         !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
976       return true;
977   }
978   if (isa<UndefValue>(Op) ||
979       (isa<ConstantPointerNull>(Op) &&
980        !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
981     return true;
982   return false;
983 }
984 
985 Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
986   Value *Op = LI.getOperand(0);
987   if (Value *Res = simplifyLoadInst(&LI, Op, SQ.getWithInstruction(&LI)))
988     return replaceInstUsesWith(LI, Res);
989 
990   // Try to canonicalize the loaded type.
991   if (Instruction *Res = combineLoadToOperationType(*this, LI))
992     return Res;
993 
994   // Replace GEP indices if possible.
995   if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI))
996     return replaceOperand(LI, 0, NewGEPI);
997 
998   if (Instruction *Res = unpackLoadToAggregate(*this, LI))
999     return Res;
1000 
1001   // Do really simple store-to-load forwarding and load CSE, to catch cases
1002   // where there are several consecutive memory accesses to the same location,
1003   // separated by a few arithmetic operations.
1004   bool IsLoadCSE = false;
1005   BatchAAResults BatchAA(*AA);
1006   if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
1007     if (IsLoadCSE)
1008       combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
1009 
1010     return replaceInstUsesWith(
1011         LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
1012                                            LI.getName() + ".cast"));
1013   }
1014 
1015   // None of the following transforms are legal for volatile/ordered atomic
1016   // loads.  Most of them do apply for unordered atomics.
1017   if (!LI.isUnordered()) return nullptr;
1018 
1019   // load(gep null, ...) -> unreachable
1020   // load null/undef -> unreachable
1021   // TODO: Consider a target hook for valid address spaces for these xforms.
1022   if (canSimplifyNullLoadOrGEP(LI, Op)) {
1023     CreateNonTerminatorUnreachable(&LI);
1024     return replaceInstUsesWith(LI, PoisonValue::get(LI.getType()));
1025   }
1026 
1027   if (Op->hasOneUse()) {
1028     // Change select and PHI nodes to select values instead of addresses: this
1029     // helps alias analysis out a lot, allows many other simplifications, and
1030     // exposes redundancy in the code.
1031     //
1032     // Note that we cannot do the transformation unless we know that the
1033     // introduced loads cannot trap!  Something like this is valid as long as
1034     // the condition is always false: load (select bool %C, int* null, int* %G),
1035     // but it would not be valid if we transformed it to load from null
1036     // unconditionally.
1037     //
1038     if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
1039       // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
1040       Align Alignment = LI.getAlign();
1041       if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
1042                                       Alignment, DL, SI) &&
1043           isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
1044                                       Alignment, DL, SI)) {
1045         LoadInst *V1 =
1046             Builder.CreateLoad(LI.getType(), SI->getOperand(1),
1047                                SI->getOperand(1)->getName() + ".val");
1048         LoadInst *V2 =
1049             Builder.CreateLoad(LI.getType(), SI->getOperand(2),
1050                                SI->getOperand(2)->getName() + ".val");
1051         assert(LI.isUnordered() && "implied by above");
1052         V1->setAlignment(Alignment);
1053         V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1054         V2->setAlignment(Alignment);
1055         V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1056         // It is safe to copy any metadata that does not trigger UB. Copy any
1057         // poison-generating metadata.
1058         V1->copyMetadata(LI, Metadata::PoisonGeneratingIDs);
1059         V2->copyMetadata(LI, Metadata::PoisonGeneratingIDs);
1060         return SelectInst::Create(SI->getCondition(), V1, V2);
1061       }
1062 
1063       // load (select (cond, null, P)) -> load P
1064       if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
1065           !NullPointerIsDefined(SI->getFunction(),
1066                                 LI.getPointerAddressSpace()))
1067         return replaceOperand(LI, 0, SI->getOperand(2));
1068 
1069       // load (select (cond, P, null)) -> load P
1070       if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
1071           !NullPointerIsDefined(SI->getFunction(),
1072                                 LI.getPointerAddressSpace()))
1073         return replaceOperand(LI, 0, SI->getOperand(1));
1074     }
1075   }
1076   return nullptr;
1077 }
1078 
1079 /// Look for extractelement/insertvalue sequence that acts like a bitcast.
1080 ///
1081 /// \returns underlying value that was "cast", or nullptr otherwise.
1082 ///
1083 /// For example, if we have:
1084 ///
1085 ///     %E0 = extractelement <2 x double> %U, i32 0
1086 ///     %V0 = insertvalue [2 x double] undef, double %E0, 0
1087 ///     %E1 = extractelement <2 x double> %U, i32 1
1088 ///     %V1 = insertvalue [2 x double] %V0, double %E1, 1
1089 ///
1090 /// and the layout of a <2 x double> is isomorphic to a [2 x double],
1091 /// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
1092 /// Note that %U may contain non-undef values where %V1 has undef.
1093 static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
1094   Value *U = nullptr;
1095   while (auto *IV = dyn_cast<InsertValueInst>(V)) {
1096     auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
1097     if (!E)
1098       return nullptr;
1099     auto *W = E->getVectorOperand();
1100     if (!U)
1101       U = W;
1102     else if (U != W)
1103       return nullptr;
1104     auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
1105     if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
1106       return nullptr;
1107     V = IV->getAggregateOperand();
1108   }
1109   if (!match(V, m_Undef()) || !U)
1110     return nullptr;
1111 
1112   auto *UT = cast<VectorType>(U->getType());
1113   auto *VT = V->getType();
1114   // Check that types UT and VT are bitwise isomorphic.
1115   const auto &DL = IC.getDataLayout();
1116   if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
1117     return nullptr;
1118   }
1119   if (auto *AT = dyn_cast<ArrayType>(VT)) {
1120     if (AT->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1121       return nullptr;
1122   } else {
1123     auto *ST = cast<StructType>(VT);
1124     if (ST->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1125       return nullptr;
1126     for (const auto *EltT : ST->elements()) {
1127       if (EltT != UT->getElementType())
1128         return nullptr;
1129     }
1130   }
1131   return U;
1132 }
1133 
1134 /// Combine stores to match the type of value being stored.
1135 ///
1136 /// The core idea here is that the memory does not have any intrinsic type and,
1137 /// where we can, we should match the type of a store to the type of value being
1138 /// stored.
1139 ///
1140 /// However, this routine must never change the width of a store or the number of
1141 /// stores as that would introduce a semantic change. This combine is expected to
1142 /// be a semantic no-op which just allows stores to more closely model the types
1143 /// of their incoming values.
1144 ///
1145 /// Currently, we also refuse to change the precise type used for an atomic or
1146 /// volatile store. This is debatable, and might be reasonable to change later.
1147 /// However, it is risky in case some backend or other part of LLVM is relying
1148 /// on the exact type stored to select appropriate atomic operations.
1149 ///
1150 /// \returns true if the store was successfully combined away. This indicates
1151 /// the caller must erase the store instruction. We have to let the caller erase
1152 /// the store instruction as otherwise there is no way to signal whether it was
1153 /// combined or not: IC.EraseInstFromFunction returns a null pointer.
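///
/// For example (illustrative IR), a store of a bitcast value:
///   %i = bitcast float %f to i32
///   store i32 %i, ptr %p
/// is rewritten as "store float %f, ptr %p", storing the original type.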
1154 static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
1155   // FIXME: We could probably with some care handle both volatile and ordered
1156   // atomic stores here but it isn't clear that this is important.
1157   if (!SI.isUnordered())
1158     return false;
1159 
1160   // swifterror values can't be bitcasted.
1161   if (SI.getPointerOperand()->isSwiftError())
1162     return false;
1163 
1164   Value *V = SI.getValueOperand();
1165 
1166   // Fold away bit casts of the stored value by storing the original type.
1167   if (auto *BC = dyn_cast<BitCastInst>(V)) {
1168     assert(!BC->getType()->isX86_AMXTy() &&
1169            "store to x86_amx* should not happen!");
1170     V = BC->getOperand(0);
1171     // Don't transform when the type is x86_amx; this keeps the pass that lowers
1172     // the x86_amx type happy.
1173     if (V->getType()->isX86_AMXTy())
1174       return false;
1175     if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
1176       combineStoreToNewValue(IC, SI, V);
1177       return true;
1178     }
1179   }
1180 
1181   if (Value *U = likeBitCastFromVector(IC, V))
1182     if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
1183       combineStoreToNewValue(IC, SI, U);
1184       return true;
1185     }
1186 
1187   // FIXME: We should also canonicalize stores of vectors when their elements
1188   // are cast to other types.
1189   return false;
1190 }
1191 
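// Try to replace a store of an aggregate type with stores of its individual
// elements, mirroring unpackLoadToAggregate above.  Returns true if the
// original store was rewritten and should be erased by the caller.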
1192 static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
1193   // FIXME: We could probably with some care handle both volatile and atomic
1194   // stores here but it isn't clear that this is important.
1195   if (!SI.isSimple())
1196     return false;
1197 
1198   Value *V = SI.getValueOperand();
1199   Type *T = V->getType();
1200 
1201   if (!T->isAggregateType())
1202     return false;
1203 
1204   if (auto *ST = dyn_cast<StructType>(T)) {
1205     // If the struct has only one element, we unpack it.
1206     unsigned Count = ST->getNumElements();
1207     if (Count == 1) {
1208       V = IC.Builder.CreateExtractValue(V, 0);
1209       combineStoreToNewValue(IC, SI, V);
1210       return true;
1211     }
1212 
1213     // We don't want to break up stores of padded structs here as we'd lose
1214     // the knowledge that padding exists for the rest of the pipeline.
1215     const DataLayout &DL = IC.getDataLayout();
1216     auto *SL = DL.getStructLayout(ST);
1217 
1218     if (SL->hasPadding())
1219       return false;
1220 
1221     const auto Align = SI.getAlign();
1222 
1223     SmallString<16> EltName = V->getName();
1224     EltName += ".elt";
1225     auto *Addr = SI.getPointerOperand();
1226     SmallString<16> AddrName = Addr->getName();
1227     AddrName += ".repack";
1228 
1229     auto *IdxType = DL.getIndexType(Addr->getType());
1230     for (unsigned i = 0; i < Count; i++) {
1231       auto *Ptr = IC.Builder.CreateInBoundsPtrAdd(
1232           Addr, IC.Builder.CreateTypeSize(IdxType, SL->getElementOffset(i)),
1233           AddrName);
1234       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1235       auto EltAlign =
1236           commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue());
1237       llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1238       NS->setAAMetadata(SI.getAAMetadata());
1239     }
1240 
1241     return true;
1242   }
1243 
1244   if (auto *AT = dyn_cast<ArrayType>(T)) {
1245     // If the array has only one element, we unpack it.
1246     auto NumElements = AT->getNumElements();
1247     if (NumElements == 1) {
1248       V = IC.Builder.CreateExtractValue(V, 0);
1249       combineStoreToNewValue(IC, SI, V);
1250       return true;
1251     }
1252 
1253     // Bail out if the array is too large. Ideally we would like to optimize
1254     // arrays of arbitrary size but this has a terrible impact on compile time.
1255     // The threshold here is chosen arbitrarily and may need a bit of
1256     // tuning.
1257     if (NumElements > IC.MaxArraySizeForCombine)
1258       return false;
1259 
1260     const DataLayout &DL = IC.getDataLayout();
1261     TypeSize EltSize = DL.getTypeAllocSize(AT->getElementType());
1262     const auto Align = SI.getAlign();
1263 
1264     SmallString<16> EltName = V->getName();
1265     EltName += ".elt";
1266     auto *Addr = SI.getPointerOperand();
1267     SmallString<16> AddrName = Addr->getName();
1268     AddrName += ".repack";
1269 
1270     auto *IdxType = Type::getInt64Ty(T->getContext());
1271     auto *Zero = ConstantInt::get(IdxType, 0);
1272 
1273     TypeSize Offset = TypeSize::getZero();
1274     for (uint64_t i = 0; i < NumElements; i++) {
1275       Value *Indices[2] = {
1276         Zero,
1277         ConstantInt::get(IdxType, i),
1278       };
1279       auto *Ptr =
1280           IC.Builder.CreateInBoundsGEP(AT, Addr, ArrayRef(Indices), AddrName);
1281       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1282       auto EltAlign = commonAlignment(Align, Offset.getKnownMinValue());
1283       Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1284       NS->setAAMetadata(SI.getAAMetadata());
1285       Offset += EltSize;
1286     }
1287 
1288     return true;
1289   }
1290 
1291   return false;
1292 }
1293 
1294 /// equivalentAddressValues - Test if A and B will obviously have the same
1295 /// value. This includes recognizing that %t0 and %t1 will have the same
1296 /// value in code like this:
1297 ///   %t0 = getelementptr \@a, 0, 3
1298 ///   store i32 0, i32* %t0
1299 ///   %t1 = getelementptr \@a, 0, 3
1300 ///   %t2 = load i32* %t1
1301 ///
1302 static bool equivalentAddressValues(Value *A, Value *B) {
1303   // Test if the values are trivially equivalent.
1304   if (A == B) return true;
1305 
1306   // Test if the values come from identical arithmetic instructions.
1307   // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
1308   // it's only used to compare two uses within the same basic block, which
1309   // means that they'll always either have the same value or one of them
1310   // will have an undefined value.
1311   if (isa<BinaryOperator>(A) ||
1312       isa<CastInst>(A) ||
1313       isa<PHINode>(A) ||
1314       isa<GetElementPtrInst>(A))
1315     if (Instruction *BI = dyn_cast<Instruction>(B))
1316       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
1317         return true;
1318 
1319   // Otherwise they may not be equivalent.
1320   return false;
1321 }
1322 
1323 Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
1324   Value *Val = SI.getOperand(0);
1325   Value *Ptr = SI.getOperand(1);
1326 
1327   // Try to canonicalize the stored type.
1328   if (combineStoreToValueType(*this, SI))
1329     return eraseInstFromFunction(SI);
1330 
1331   // Try to unpack aggregate stores into their elements.
1332   if (unpackStoreToAggregate(*this, SI))
1333     return eraseInstFromFunction(SI);
1334 
1335   // Replace GEP indices if possible.
1336   if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI))
1337     return replaceOperand(SI, 1, NewGEPI);
1338 
1339   // Don't hack volatile/ordered stores.
1340   // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
1341   if (!SI.isUnordered()) return nullptr;
1342 
1343   // If the pointer operand is an alloca used only by this store (possibly
1344   // via a single-use GEP), the store is dead: erase it; the alloca dies too.
1345   if (Ptr->hasOneUse()) {
1346     if (isa<AllocaInst>(Ptr))
1347       return eraseInstFromFunction(SI);
1348     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1349       if (isa<AllocaInst>(GEP->getOperand(0))) {
1350         if (GEP->getOperand(0)->hasOneUse())
1351           return eraseInstFromFunction(SI);
1352       }
1353     }
1354   }
1355 
1356   // If we have a store to a location known to be constant memory, the store
1357   // must be storing the value already there (otherwise the memory would not
1358   // be constant), and so it is a no-op.
1359   if (!isModSet(AA->getModRefInfoMask(Ptr)))
1360     return eraseInstFromFunction(SI);
1361 
1362   // Do really simple DSE, to catch cases where there are several consecutive
1363   // stores to the same location, separated by a few arithmetic operations. This
1364   // situation often occurs with bitfield accesses.
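  // For example (an illustrative sketch, not taken from the source):
  //   store i32 %a, ptr %p
  //   %b = or i32 %a, 128        ; bitfield update, touches no memory
  //   store i32 %b, ptr %p       ; makes the first store dead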
1365   BasicBlock::iterator BBI(SI);
1366   for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
1367        --ScanInsts) {
1368     --BBI;
1369     // Don't count debug or pseudo instructions against the scan limit, lest
1370     // their presence affect codegen.
1371     if (BBI->isDebugOrPseudoInst()) {
1372       ScanInsts++;
1373       continue;
1374     }
1375 
1376     if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
1377       // Is the previous store unordered and to the same location and type?
1378       if (PrevSI->isUnordered() &&
1379           equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1)) &&
1380           PrevSI->getValueOperand()->getType() ==
1381               SI.getValueOperand()->getType()) {
1382         ++NumDeadStore;
1383         // Manually add back the original store to the worklist now, so it will
1384         // be processed after the operands of the removed store, as this may
1385         // expose additional DSE opportunities.
1386         Worklist.push(&SI);
1387         eraseInstFromFunction(*PrevSI);
1388         return nullptr;
1389       }
1390       break;
1391     }
1392 
1393     // If this is a load, we have to stop. However, if the value being stored
1394     // is a load from the very same pointer, then *this* store is dead
1395     // (X = load P; store X -> P).
1396     if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
1397       if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
1398         assert(SI.isUnordered() && "can't eliminate ordering operation");
1399         return eraseInstFromFunction(SI);
1400       }
1401 
1402       // Otherwise, this is a load from some other location.  Stores before it
1403       // may not be dead.
1404       break;
1405     }
1406 
1407     // Don't skip over loads, throws or things that can modify memory.
1408     if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
1409       break;
1410   }
1411 
1412   // store X, null    -> turns into 'unreachable' in SimplifyCFG
1413   // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
1414   if (canSimplifyNullStoreOrGEP(SI)) {
1415     if (!isa<PoisonValue>(Val))
1416       return replaceOperand(SI, 0, PoisonValue::get(Val->getType()));
1417     return nullptr;  // Do not modify these!
1418   }
1419 
1420   // A store to undef acts as a non-terminator unreachable marker; keep it.
1421   if (isa<UndefValue>(Ptr)) {
1422     // Remove guaranteed-to-transfer instructions before the marker.
1423     if (removeInstructionsBeforeUnreachable(SI))
1424       return &SI;
1425 
1426     // Remove all instructions after the marker and handle dead blocks this
1427     // implies.
1428     SmallVector<BasicBlock *> Worklist;
1429     handleUnreachableFrom(SI.getNextNode(), Worklist);
1430     handlePotentiallyDeadBlocks(Worklist);
1431     return nullptr;
1432   }
1433 
1434   // store undef, Ptr -> noop
1435   // FIXME: This is technically incorrect because it might overwrite a poison
1436   // value. Change to PoisonValue once #52930 is resolved.
1437   if (isa<UndefValue>(Val))
1438     return eraseInstFromFunction(SI);
1439 
1440   return nullptr;
1441 }
1442 
1443 /// Try to transform:
1444 ///   if () { *P = v1; } else { *P = v2; }
1445 /// or:
1446 ///   *P = v1; if () { *P = v2; }
1447 /// into a phi node with a store in the successor.
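/// A rough sketch of the resulting IR in the common successor (illustrative
/// only; an i32 store is assumed and the value names are made up):
///   DestBB:
///     %storemerge = phi i32 [ %v1, %StoreBB ], [ %v2, %OtherBB ]
///     store i32 %storemerge, ptr %P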
1448 bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
1449   if (!SI.isUnordered())
1450     return false; // This code has not been audited for volatile/ordered case.
1451 
1452   // Check if the successor block has exactly 2 incoming edges.
1453   BasicBlock *StoreBB = SI.getParent();
1454   BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
1455   if (!DestBB->hasNPredecessors(2))
1456     return false;
1457 
1458   // Capture the other block (the block that doesn't contain our store).
1459   pred_iterator PredIter = pred_begin(DestBB);
1460   if (*PredIter == StoreBB)
1461     ++PredIter;
1462   BasicBlock *OtherBB = *PredIter;
1463 
1464   // Bail out unless the relevant blocks are all distinct. This can happen,
1465   // for example, if SI is in an infinite loop.
1466   if (StoreBB == DestBB || OtherBB == DestBB)
1467     return false;
1468 
1469   // Verify that the other block ends in a branch and is not otherwise empty.
1470   BasicBlock::iterator BBI(OtherBB->getTerminator());
1471   BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
1472   if (!OtherBr || BBI == OtherBB->begin())
1473     return false;
1474 
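  // A candidate store is mergeable with SI if it writes to the same pointer,
  // stores a value whose type can be bitcast or no-op-pointer-cast to SI's
  // value type, and matches SI's special state (volatility, alignment, atomic
  // ordering and sync scope).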
1475   auto OtherStoreIsMergeable = [&](StoreInst *OtherStore) -> bool {
1476     if (!OtherStore ||
1477         OtherStore->getPointerOperand() != SI.getPointerOperand())
1478       return false;
1479 
1480     auto *SIVTy = SI.getValueOperand()->getType();
1481     auto *OSVTy = OtherStore->getValueOperand()->getType();
1482     return CastInst::isBitOrNoopPointerCastable(OSVTy, SIVTy, DL) &&
1483            SI.hasSameSpecialState(OtherStore);
1484   };
1485 
1486   // If the other block ends in an unconditional branch, check for the
1487   // 'if/then/else' case: the matching store is just before that branch.
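  // Sketch of the 'if/then/else' shape handled in the branch below:
  //
  //    StoreBB    OtherBB     <- OtherBB ends in an unconditional branch
  //         \      /
  //          DestBB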
1488   StoreInst *OtherStore = nullptr;
1489   if (OtherBr->isUnconditional()) {
1490     --BBI;
1491     // Skip over debugging info and pseudo probes.
1492     while (BBI->isDebugOrPseudoInst()) {
1493       if (BBI == OtherBB->begin())
1494         return false;
1495       --BBI;
1496     }
1497     // If this isn't a store, isn't to the same location, or isn't the right
1498     // kind of store, bail out.
1499     OtherStore = dyn_cast<StoreInst>(BBI);
1500     if (!OtherStoreIsMergeable(OtherStore))
1501       return false;
1502   } else {
1503     // Otherwise, the other block ended with a conditional branch. If one of the
1504     // destinations is StoreBB, then we have the if/then case.
1505     if (OtherBr->getSuccessor(0) != StoreBB &&
1506         OtherBr->getSuccessor(1) != StoreBB)
1507       return false;
1508 
1509     // Okay, we know that OtherBr branches to both StoreBB and DestBB, so this
1510     // is an if/then triangle. See if there is a store to the same ptr as SI
1511     // that lives in OtherBB.
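    // Sketch of the 'if/then' triangle handled here:
    //
    //    OtherBB          <- ends in a conditional branch
    //     |    \
    //     |   StoreBB
    //     |    /
    //    DestBB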
1512     for (;; --BBI) {
1513       // Check to see if we find the matching store.
1514       OtherStore = dyn_cast<StoreInst>(BBI);
1515       if (OtherStoreIsMergeable(OtherStore))
1516         break;
1517 
1518       // If we find something that may be using or overwriting the stored
1519       // value, or if we run out of instructions, we can't do the transform.
1520       if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
1521           BBI->mayWriteToMemory() || BBI == OtherBB->begin())
1522         return false;
1523     }
1524 
1525     // In order to eliminate the store in OtherBB, we have to make sure nothing
1526     // reads or overwrites the stored value in StoreBB before SI.
1527     for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
1528       // FIXME: This should really be AA driven.
1529       if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
1530         return false;
1531     }
1532   }
1533 
1534   // Insert a PHI node now if we need it.
1535   Value *MergedVal = OtherStore->getValueOperand();
1536   // The debug locations of the original instructions might differ. Merge them.
1537   DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
1538                                                      OtherStore->getDebugLoc());
1539   if (MergedVal != SI.getValueOperand()) {
1540     PHINode *PN =
1541         PHINode::Create(SI.getValueOperand()->getType(), 2, "storemerge");
1542     PN->addIncoming(SI.getValueOperand(), SI.getParent());
1543     Builder.SetInsertPoint(OtherStore);
1544     PN->addIncoming(Builder.CreateBitOrPointerCast(MergedVal, PN->getType()),
1545                     OtherBB);
1546     MergedVal = InsertNewInstBefore(PN, DestBB->begin());
1547     PN->setDebugLoc(MergedLoc);
1548   }
1549 
1550   // Advance to a place where it is safe to insert the new store and insert it.
1551   BBI = DestBB->getFirstInsertionPt();
1552   StoreInst *NewSI =
1553       new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
1554                     SI.getOrdering(), SI.getSyncScopeID());
1555   InsertNewInstBefore(NewSI, BBI);
1556   NewSI->setDebugLoc(MergedLoc);
1557   NewSI->mergeDIAssignID({&SI, OtherStore});
1558 
1559   // If the two stores had AA tags, merge them.
1560   AAMDNodes AATags = SI.getAAMetadata();
1561   if (AATags)
1562     NewSI->setAAMetadata(AATags.merge(OtherStore->getAAMetadata()));
1563 
1564   // Nuke the old stores.
1565   eraseInstFromFunction(SI);
1566   eraseInstFromFunction(*OtherStore);
1567   return true;
1568 }
1569