xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp (revision 5deeebd8c6ca991269e72902a7a62cada57947f6)
15ffd83dbSDimitry Andric //===------- VectorCombine.cpp - Optimize partial vector operations -------===//
25ffd83dbSDimitry Andric //
35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric //
75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
85ffd83dbSDimitry Andric //
95ffd83dbSDimitry Andric // This pass optimizes scalar/vector interactions using target cost models. The
105ffd83dbSDimitry Andric // transforms implemented here may not fit in traditional loop-based or SLP
115ffd83dbSDimitry Andric // vectorization passes.
125ffd83dbSDimitry Andric //
135ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
145ffd83dbSDimitry Andric 
155ffd83dbSDimitry Andric #include "llvm/Transforms/Vectorize/VectorCombine.h"
165f757f3fSDimitry Andric #include "llvm/ADT/DenseMap.h"
170fca6ea1SDimitry Andric #include "llvm/ADT/STLExtras.h"
185f757f3fSDimitry Andric #include "llvm/ADT/ScopeExit.h"
195ffd83dbSDimitry Andric #include "llvm/ADT/Statistic.h"
20fe6060f1SDimitry Andric #include "llvm/Analysis/AssumptionCache.h"
215ffd83dbSDimitry Andric #include "llvm/Analysis/BasicAliasAnalysis.h"
225ffd83dbSDimitry Andric #include "llvm/Analysis/GlobalsModRef.h"
23e8d8bef9SDimitry Andric #include "llvm/Analysis/Loads.h"
245ffd83dbSDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
255ffd83dbSDimitry Andric #include "llvm/Analysis/ValueTracking.h"
265ffd83dbSDimitry Andric #include "llvm/Analysis/VectorUtils.h"
275ffd83dbSDimitry Andric #include "llvm/IR/Dominators.h"
285ffd83dbSDimitry Andric #include "llvm/IR/Function.h"
295ffd83dbSDimitry Andric #include "llvm/IR/IRBuilder.h"
305ffd83dbSDimitry Andric #include "llvm/IR/PatternMatch.h"
315ffd83dbSDimitry Andric #include "llvm/Support/CommandLine.h"
325ffd83dbSDimitry Andric #include "llvm/Transforms/Utils/Local.h"
330fca6ea1SDimitry Andric #include "llvm/Transforms/Utils/LoopUtils.h"
34bdd1243dSDimitry Andric #include <numeric>
355f757f3fSDimitry Andric #include <queue>
365ffd83dbSDimitry Andric 
37349cc55cSDimitry Andric #define DEBUG_TYPE "vector-combine"
38349cc55cSDimitry Andric #include "llvm/Transforms/Utils/InstructionWorklist.h"
39349cc55cSDimitry Andric 
405ffd83dbSDimitry Andric using namespace llvm;
415ffd83dbSDimitry Andric using namespace llvm::PatternMatch;
425ffd83dbSDimitry Andric 
43e8d8bef9SDimitry Andric STATISTIC(NumVecLoad, "Number of vector loads formed");
445ffd83dbSDimitry Andric STATISTIC(NumVecCmp, "Number of vector compares formed");
455ffd83dbSDimitry Andric STATISTIC(NumVecBO, "Number of vector binops formed");
465ffd83dbSDimitry Andric STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
475ffd83dbSDimitry Andric STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
485ffd83dbSDimitry Andric STATISTIC(NumScalarBO, "Number of scalar binops formed");
495ffd83dbSDimitry Andric STATISTIC(NumScalarCmp, "Number of scalar compares formed");
505ffd83dbSDimitry Andric 
515ffd83dbSDimitry Andric static cl::opt<bool> DisableVectorCombine(
525ffd83dbSDimitry Andric     "disable-vector-combine", cl::init(false), cl::Hidden,
535ffd83dbSDimitry Andric     cl::desc("Disable all vector combine transforms"));
545ffd83dbSDimitry Andric 
555ffd83dbSDimitry Andric static cl::opt<bool> DisableBinopExtractShuffle(
565ffd83dbSDimitry Andric     "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
575ffd83dbSDimitry Andric     cl::desc("Disable binop extract to shuffle transforms"));
585ffd83dbSDimitry Andric 
59fe6060f1SDimitry Andric static cl::opt<unsigned> MaxInstrsToScan(
60fe6060f1SDimitry Andric     "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
61fe6060f1SDimitry Andric     cl::desc("Max number of instructions to scan for vector combining."));
62fe6060f1SDimitry Andric 
635ffd83dbSDimitry Andric static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
645ffd83dbSDimitry Andric 
655ffd83dbSDimitry Andric namespace {
665ffd83dbSDimitry Andric class VectorCombine {
675ffd83dbSDimitry Andric public:
685ffd83dbSDimitry Andric   VectorCombine(Function &F, const TargetTransformInfo &TTI,
69349cc55cSDimitry Andric                 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
700fca6ea1SDimitry Andric                 const DataLayout *DL, bool TryEarlyFoldsOnly)
710fca6ea1SDimitry Andric       : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL),
72bdd1243dSDimitry Andric         TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
735ffd83dbSDimitry Andric 
745ffd83dbSDimitry Andric   bool run();
755ffd83dbSDimitry Andric 
765ffd83dbSDimitry Andric private:
775ffd83dbSDimitry Andric   Function &F;
785ffd83dbSDimitry Andric   IRBuilder<> Builder;
795ffd83dbSDimitry Andric   const TargetTransformInfo &TTI;
805ffd83dbSDimitry Andric   const DominatorTree &DT;
81fe6060f1SDimitry Andric   AAResults &AA;
82fe6060f1SDimitry Andric   AssumptionCache &AC;
830fca6ea1SDimitry Andric   const DataLayout *DL;
845ffd83dbSDimitry Andric 
85bdd1243dSDimitry Andric   /// If true, only perform beneficial early IR transforms. Do not introduce new
86349cc55cSDimitry Andric   /// vector operations.
87bdd1243dSDimitry Andric   bool TryEarlyFoldsOnly;
88349cc55cSDimitry Andric 
89349cc55cSDimitry Andric   InstructionWorklist Worklist;
90349cc55cSDimitry Andric 
91bdd1243dSDimitry Andric   // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
92bdd1243dSDimitry Andric   //       parameter. That should be updated to specific sub-classes because the
93bdd1243dSDimitry Andric   //       run loop was changed to dispatch on opcode.
94e8d8bef9SDimitry Andric   bool vectorizeLoadInsert(Instruction &I);
95bdd1243dSDimitry Andric   bool widenSubvectorLoad(Instruction &I);
965ffd83dbSDimitry Andric   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
975ffd83dbSDimitry Andric                                         ExtractElementInst *Ext1,
985ffd83dbSDimitry Andric                                         unsigned PreferredExtractIndex) const;
995ffd83dbSDimitry Andric   bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
100349cc55cSDimitry Andric                              const Instruction &I,
1015ffd83dbSDimitry Andric                              ExtractElementInst *&ConvertToShuffle,
1025ffd83dbSDimitry Andric                              unsigned PreferredExtractIndex);
1035ffd83dbSDimitry Andric   void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
1045ffd83dbSDimitry Andric                      Instruction &I);
1055ffd83dbSDimitry Andric   void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
1065ffd83dbSDimitry Andric                        Instruction &I);
1075ffd83dbSDimitry Andric   bool foldExtractExtract(Instruction &I);
108bdd1243dSDimitry Andric   bool foldInsExtFNeg(Instruction &I);
1095f757f3fSDimitry Andric   bool foldBitcastShuffle(Instruction &I);
1105ffd83dbSDimitry Andric   bool scalarizeBinopOrCmp(Instruction &I);
1115f757f3fSDimitry Andric   bool scalarizeVPIntrinsic(Instruction &I);
1125ffd83dbSDimitry Andric   bool foldExtractedCmps(Instruction &I);
113fe6060f1SDimitry Andric   bool foldSingleElementStore(Instruction &I);
114fe6060f1SDimitry Andric   bool scalarizeLoadExtract(Instruction &I);
115349cc55cSDimitry Andric   bool foldShuffleOfBinops(Instruction &I);
1160fca6ea1SDimitry Andric   bool foldShuffleOfCastops(Instruction &I);
1170fca6ea1SDimitry Andric   bool foldShuffleOfShuffles(Instruction &I);
1180fca6ea1SDimitry Andric   bool foldShuffleToIdentity(Instruction &I);
11981ad6265SDimitry Andric   bool foldShuffleFromReductions(Instruction &I);
1200fca6ea1SDimitry Andric   bool foldCastFromReductions(Instruction &I);
12181ad6265SDimitry Andric   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
1225ffd83dbSDimitry Andric 
123349cc55cSDimitry Andric   void replaceValue(Value &Old, Value &New) {
1245ffd83dbSDimitry Andric     Old.replaceAllUsesWith(&New);
125349cc55cSDimitry Andric     if (auto *NewI = dyn_cast<Instruction>(&New)) {
12681ad6265SDimitry Andric       New.takeName(&Old);
127349cc55cSDimitry Andric       Worklist.pushUsersToWorkList(*NewI);
128349cc55cSDimitry Andric       Worklist.pushValue(NewI);
1295ffd83dbSDimitry Andric     }
130349cc55cSDimitry Andric     Worklist.pushValue(&Old);
131349cc55cSDimitry Andric   }
132349cc55cSDimitry Andric 
133349cc55cSDimitry Andric   void eraseInstruction(Instruction &I) {
134349cc55cSDimitry Andric     for (Value *Op : I.operands())
135349cc55cSDimitry Andric       Worklist.pushValue(Op);
136349cc55cSDimitry Andric     Worklist.remove(&I);
137349cc55cSDimitry Andric     I.eraseFromParent();
138349cc55cSDimitry Andric   }
139349cc55cSDimitry Andric };
140349cc55cSDimitry Andric } // namespace
1415ffd83dbSDimitry Andric 
1420fca6ea1SDimitry Andric /// Return the source operand of a potentially bitcasted value. If there is no
1430fca6ea1SDimitry Andric /// bitcast, return the input value itself.
1440fca6ea1SDimitry Andric static Value *peekThroughBitcasts(Value *V) {
1450fca6ea1SDimitry Andric   while (auto *BitCast = dyn_cast<BitCastInst>(V))
1460fca6ea1SDimitry Andric     V = BitCast->getOperand(0);
1470fca6ea1SDimitry Andric   return V;
1480fca6ea1SDimitry Andric }
1490fca6ea1SDimitry Andric 
150bdd1243dSDimitry Andric static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
151bdd1243dSDimitry Andric   // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
152bdd1243dSDimitry Andric   // The widened load may load data from dirty regions or create data races
153bdd1243dSDimitry Andric   // non-existent in the source.
154bdd1243dSDimitry Andric   if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
155bdd1243dSDimitry Andric       Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
156bdd1243dSDimitry Andric       mustSuppressSpeculation(*Load))
157bdd1243dSDimitry Andric     return false;
158bdd1243dSDimitry Andric 
159bdd1243dSDimitry Andric   // We are potentially transforming byte-sized (8-bit) memory accesses, so make
160bdd1243dSDimitry Andric   // sure we have all of our type-based constraints in place for this target.
161bdd1243dSDimitry Andric   Type *ScalarTy = Load->getType()->getScalarType();
162bdd1243dSDimitry Andric   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
163bdd1243dSDimitry Andric   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
164bdd1243dSDimitry Andric   if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
165bdd1243dSDimitry Andric       ScalarSize % 8 != 0)
166bdd1243dSDimitry Andric     return false;
167bdd1243dSDimitry Andric 
168bdd1243dSDimitry Andric   return true;
169bdd1243dSDimitry Andric }
170bdd1243dSDimitry Andric 
171e8d8bef9SDimitry Andric bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
172e8d8bef9SDimitry Andric   // Match insert into fixed vector of scalar value.
173e8d8bef9SDimitry Andric   // TODO: Handle non-zero insert index.
174e8d8bef9SDimitry Andric   Value *Scalar;
175bdd1243dSDimitry Andric   if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
176e8d8bef9SDimitry Andric       !Scalar->hasOneUse())
177e8d8bef9SDimitry Andric     return false;
178e8d8bef9SDimitry Andric 
179e8d8bef9SDimitry Andric   // Optionally match an extract from another vector.
180e8d8bef9SDimitry Andric   Value *X;
181e8d8bef9SDimitry Andric   bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
182e8d8bef9SDimitry Andric   if (!HasExtract)
183e8d8bef9SDimitry Andric     X = Scalar;
184e8d8bef9SDimitry Andric 
185e8d8bef9SDimitry Andric   auto *Load = dyn_cast<LoadInst>(X);
186bdd1243dSDimitry Andric   if (!canWidenLoad(Load, TTI))
187e8d8bef9SDimitry Andric     return false;
188e8d8bef9SDimitry Andric 
189e8d8bef9SDimitry Andric   Type *ScalarTy = Scalar->getType();
190e8d8bef9SDimitry Andric   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
191e8d8bef9SDimitry Andric   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
192e8d8bef9SDimitry Andric 
193e8d8bef9SDimitry Andric   // Check safety of replacing the scalar load with a larger vector load.
194e8d8bef9SDimitry Andric   // We use minimal alignment (maximum flexibility) because we only care about
195e8d8bef9SDimitry Andric   // the dereferenceable region. When calculating cost and creating a new op,
196e8d8bef9SDimitry Andric   // we may use a larger value based on alignment attributes.
197bdd1243dSDimitry Andric   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
198bdd1243dSDimitry Andric   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
199bdd1243dSDimitry Andric 
200e8d8bef9SDimitry Andric   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
201e8d8bef9SDimitry Andric   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
202e8d8bef9SDimitry Andric   unsigned OffsetEltIndex = 0;
203e8d8bef9SDimitry Andric   Align Alignment = Load->getAlign();
2040fca6ea1SDimitry Andric   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
205bdd1243dSDimitry Andric                                    &DT)) {
206e8d8bef9SDimitry Andric     // It is not safe to load directly from the pointer, but we can still peek
207e8d8bef9SDimitry Andric     // through gep offsets and check if it safe to load from a base address with
208e8d8bef9SDimitry Andric     // updated alignment. If it is, we can shuffle the element(s) into place
209e8d8bef9SDimitry Andric     // after loading.
2100fca6ea1SDimitry Andric     unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
211e8d8bef9SDimitry Andric     APInt Offset(OffsetBitWidth, 0);
2120fca6ea1SDimitry Andric     SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
213e8d8bef9SDimitry Andric 
214e8d8bef9SDimitry Andric     // We want to shuffle the result down from a high element of a vector, so
215e8d8bef9SDimitry Andric     // the offset must be positive.
216e8d8bef9SDimitry Andric     if (Offset.isNegative())
217e8d8bef9SDimitry Andric       return false;
218e8d8bef9SDimitry Andric 
219e8d8bef9SDimitry Andric     // The offset must be a multiple of the scalar element to shuffle cleanly
220e8d8bef9SDimitry Andric     // in the element's size.
221e8d8bef9SDimitry Andric     uint64_t ScalarSizeInBytes = ScalarSize / 8;
222e8d8bef9SDimitry Andric     if (Offset.urem(ScalarSizeInBytes) != 0)
223e8d8bef9SDimitry Andric       return false;
224e8d8bef9SDimitry Andric 
225e8d8bef9SDimitry Andric     // If we load MinVecNumElts, will our target element still be loaded?
226e8d8bef9SDimitry Andric     OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
227e8d8bef9SDimitry Andric     if (OffsetEltIndex >= MinVecNumElts)
228e8d8bef9SDimitry Andric       return false;
229e8d8bef9SDimitry Andric 
2300fca6ea1SDimitry Andric     if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
231bdd1243dSDimitry Andric                                      &DT))
232e8d8bef9SDimitry Andric       return false;
233e8d8bef9SDimitry Andric 
234e8d8bef9SDimitry Andric     // Update alignment with offset value. Note that the offset could be negated
235e8d8bef9SDimitry Andric     // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
236e8d8bef9SDimitry Andric     // negation does not change the result of the alignment calculation.
237e8d8bef9SDimitry Andric     Alignment = commonAlignment(Alignment, Offset.getZExtValue());
238e8d8bef9SDimitry Andric   }
239e8d8bef9SDimitry Andric 
240e8d8bef9SDimitry Andric   // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
241e8d8bef9SDimitry Andric   // Use the greater of the alignment on the load or its source pointer.
2420fca6ea1SDimitry Andric   Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
243e8d8bef9SDimitry Andric   Type *LoadTy = Load->getType();
244bdd1243dSDimitry Andric   unsigned AS = Load->getPointerAddressSpace();
245e8d8bef9SDimitry Andric   InstructionCost OldCost =
246e8d8bef9SDimitry Andric       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
247e8d8bef9SDimitry Andric   APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
248bdd1243dSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
249bdd1243dSDimitry Andric   OldCost +=
250bdd1243dSDimitry Andric       TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
251bdd1243dSDimitry Andric                                    /* Insert */ true, HasExtract, CostKind);
252e8d8bef9SDimitry Andric 
253e8d8bef9SDimitry Andric   // New pattern: load VecPtr
254e8d8bef9SDimitry Andric   InstructionCost NewCost =
255e8d8bef9SDimitry Andric       TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
256e8d8bef9SDimitry Andric   // Optionally, we are shuffling the loaded vector element(s) into place.
257fe6060f1SDimitry Andric   // For the mask set everything but element 0 to undef to prevent poison from
258fe6060f1SDimitry Andric   // propagating from the extra loaded memory. This will also optionally
259fe6060f1SDimitry Andric   // shrink/grow the vector from the loaded size to the output size.
260fe6060f1SDimitry Andric   // We assume this operation has no cost in codegen if there was no offset.
261fe6060f1SDimitry Andric   // Note that we could use freeze to avoid poison problems, but then we might
262fe6060f1SDimitry Andric   // still need a shuffle to change the vector size.
263bdd1243dSDimitry Andric   auto *Ty = cast<FixedVectorType>(I.getType());
264fe6060f1SDimitry Andric   unsigned OutputNumElts = Ty->getNumElements();
26506c3fb27SDimitry Andric   SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
266fe6060f1SDimitry Andric   assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
267fe6060f1SDimitry Andric   Mask[0] = OffsetEltIndex;
268e8d8bef9SDimitry Andric   if (OffsetEltIndex)
269fe6060f1SDimitry Andric     NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);
270e8d8bef9SDimitry Andric 
271e8d8bef9SDimitry Andric   // We can aggressively convert to the vector form because the backend can
272e8d8bef9SDimitry Andric   // invert this transform if it does not result in a performance win.
273e8d8bef9SDimitry Andric   if (OldCost < NewCost || !NewCost.isValid())
274e8d8bef9SDimitry Andric     return false;
275e8d8bef9SDimitry Andric 
276e8d8bef9SDimitry Andric   // It is safe and potentially profitable to load a vector directly:
277e8d8bef9SDimitry Andric   // inselt undef, load Scalar, 0 --> load VecPtr
278e8d8bef9SDimitry Andric   IRBuilder<> Builder(Load);
2795f757f3fSDimitry Andric   Value *CastedPtr =
2805f757f3fSDimitry Andric       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
281e8d8bef9SDimitry Andric   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
282e8d8bef9SDimitry Andric   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
283e8d8bef9SDimitry Andric 
284e8d8bef9SDimitry Andric   replaceValue(I, *VecLd);
285e8d8bef9SDimitry Andric   ++NumVecLoad;
286e8d8bef9SDimitry Andric   return true;
287e8d8bef9SDimitry Andric }
288e8d8bef9SDimitry Andric 
289bdd1243dSDimitry Andric /// If we are loading a vector and then inserting it into a larger vector with
290bdd1243dSDimitry Andric /// undefined elements, try to load the larger vector and eliminate the insert.
291bdd1243dSDimitry Andric /// This removes a shuffle in IR and may allow combining of other loaded values.
292bdd1243dSDimitry Andric bool VectorCombine::widenSubvectorLoad(Instruction &I) {
293bdd1243dSDimitry Andric   // Match subvector insert of fixed vector.
294bdd1243dSDimitry Andric   auto *Shuf = cast<ShuffleVectorInst>(&I);
295bdd1243dSDimitry Andric   if (!Shuf->isIdentityWithPadding())
296bdd1243dSDimitry Andric     return false;
297bdd1243dSDimitry Andric 
298bdd1243dSDimitry Andric   // Allow a non-canonical shuffle mask that is choosing elements from op1.
299bdd1243dSDimitry Andric   unsigned NumOpElts =
300bdd1243dSDimitry Andric       cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
301bdd1243dSDimitry Andric   unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
302bdd1243dSDimitry Andric     return M >= (int)(NumOpElts);
303bdd1243dSDimitry Andric   });
304bdd1243dSDimitry Andric 
305bdd1243dSDimitry Andric   auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
306bdd1243dSDimitry Andric   if (!canWidenLoad(Load, TTI))
307bdd1243dSDimitry Andric     return false;
308bdd1243dSDimitry Andric 
309bdd1243dSDimitry Andric   // We use minimal alignment (maximum flexibility) because we only care about
310bdd1243dSDimitry Andric   // the dereferenceable region. When calculating cost and creating a new op,
311bdd1243dSDimitry Andric   // we may use a larger value based on alignment attributes.
312bdd1243dSDimitry Andric   auto *Ty = cast<FixedVectorType>(I.getType());
313bdd1243dSDimitry Andric   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
314bdd1243dSDimitry Andric   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
315bdd1243dSDimitry Andric   Align Alignment = Load->getAlign();
3160fca6ea1SDimitry Andric   if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
317bdd1243dSDimitry Andric     return false;
318bdd1243dSDimitry Andric 
3190fca6ea1SDimitry Andric   Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
320bdd1243dSDimitry Andric   Type *LoadTy = Load->getType();
321bdd1243dSDimitry Andric   unsigned AS = Load->getPointerAddressSpace();
322bdd1243dSDimitry Andric 
323bdd1243dSDimitry Andric   // Original pattern: insert_subvector (load PtrOp)
324bdd1243dSDimitry Andric   // This conservatively assumes that the cost of a subvector insert into an
325bdd1243dSDimitry Andric   // undef value is 0. We could add that cost if the cost model accurately
326bdd1243dSDimitry Andric   // reflects the real cost of that operation.
327bdd1243dSDimitry Andric   InstructionCost OldCost =
328bdd1243dSDimitry Andric       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
329bdd1243dSDimitry Andric 
330bdd1243dSDimitry Andric   // New pattern: load PtrOp
331bdd1243dSDimitry Andric   InstructionCost NewCost =
332bdd1243dSDimitry Andric       TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
333bdd1243dSDimitry Andric 
334bdd1243dSDimitry Andric   // We can aggressively convert to the vector form because the backend can
335bdd1243dSDimitry Andric   // invert this transform if it does not result in a performance win.
336bdd1243dSDimitry Andric   if (OldCost < NewCost || !NewCost.isValid())
337bdd1243dSDimitry Andric     return false;
338bdd1243dSDimitry Andric 
339bdd1243dSDimitry Andric   IRBuilder<> Builder(Load);
340bdd1243dSDimitry Andric   Value *CastedPtr =
3415f757f3fSDimitry Andric       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
342bdd1243dSDimitry Andric   Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
343bdd1243dSDimitry Andric   replaceValue(I, *VecLd);
344bdd1243dSDimitry Andric   ++NumVecLoad;
345bdd1243dSDimitry Andric   return true;
346bdd1243dSDimitry Andric }
347bdd1243dSDimitry Andric 
3485ffd83dbSDimitry Andric /// Determine which, if any, of the inputs should be replaced by a shuffle
3495ffd83dbSDimitry Andric /// followed by extract from a different index.
3505ffd83dbSDimitry Andric ExtractElementInst *VectorCombine::getShuffleExtract(
3515ffd83dbSDimitry Andric     ExtractElementInst *Ext0, ExtractElementInst *Ext1,
3525ffd83dbSDimitry Andric     unsigned PreferredExtractIndex = InvalidIndex) const {
35381ad6265SDimitry Andric   auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
35481ad6265SDimitry Andric   auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
35581ad6265SDimitry Andric   assert(Index0C && Index1C && "Expected constant extract indexes");
3565ffd83dbSDimitry Andric 
35781ad6265SDimitry Andric   unsigned Index0 = Index0C->getZExtValue();
35881ad6265SDimitry Andric   unsigned Index1 = Index1C->getZExtValue();
3595ffd83dbSDimitry Andric 
3605ffd83dbSDimitry Andric   // If the extract indexes are identical, no shuffle is needed.
3615ffd83dbSDimitry Andric   if (Index0 == Index1)
3625ffd83dbSDimitry Andric     return nullptr;
3635ffd83dbSDimitry Andric 
3645ffd83dbSDimitry Andric   Type *VecTy = Ext0->getVectorOperand()->getType();
365bdd1243dSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3665ffd83dbSDimitry Andric   assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
367e8d8bef9SDimitry Andric   InstructionCost Cost0 =
368bdd1243dSDimitry Andric       TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
369e8d8bef9SDimitry Andric   InstructionCost Cost1 =
370bdd1243dSDimitry Andric       TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
371e8d8bef9SDimitry Andric 
372e8d8bef9SDimitry Andric   // If both costs are invalid no shuffle is needed
373e8d8bef9SDimitry Andric   if (!Cost0.isValid() && !Cost1.isValid())
374e8d8bef9SDimitry Andric     return nullptr;
3755ffd83dbSDimitry Andric 
3765ffd83dbSDimitry Andric   // We are extracting from 2 different indexes, so one operand must be shuffled
3775ffd83dbSDimitry Andric   // before performing a vector operation and/or extract. The more expensive
3785ffd83dbSDimitry Andric   // extract will be replaced by a shuffle.
3795ffd83dbSDimitry Andric   if (Cost0 > Cost1)
3805ffd83dbSDimitry Andric     return Ext0;
3815ffd83dbSDimitry Andric   if (Cost1 > Cost0)
3825ffd83dbSDimitry Andric     return Ext1;
3835ffd83dbSDimitry Andric 
3845ffd83dbSDimitry Andric   // If the costs are equal and there is a preferred extract index, shuffle the
3855ffd83dbSDimitry Andric   // opposite operand.
3865ffd83dbSDimitry Andric   if (PreferredExtractIndex == Index0)
3875ffd83dbSDimitry Andric     return Ext1;
3885ffd83dbSDimitry Andric   if (PreferredExtractIndex == Index1)
3895ffd83dbSDimitry Andric     return Ext0;
3905ffd83dbSDimitry Andric 
3915ffd83dbSDimitry Andric   // Otherwise, replace the extract with the higher index.
3925ffd83dbSDimitry Andric   return Index0 > Index1 ? Ext0 : Ext1;
3935ffd83dbSDimitry Andric }
3945ffd83dbSDimitry Andric 
3955ffd83dbSDimitry Andric /// Compare the relative costs of 2 extracts followed by scalar operation vs.
3965ffd83dbSDimitry Andric /// vector operation(s) followed by extract. Return true if the existing
3975ffd83dbSDimitry Andric /// instructions are cheaper than a vector alternative. Otherwise, return false
3985ffd83dbSDimitry Andric /// and if one of the extracts should be transformed to a shufflevector, set
3995ffd83dbSDimitry Andric /// \p ConvertToShuffle to that extract instruction.
4005ffd83dbSDimitry Andric bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
4015ffd83dbSDimitry Andric                                           ExtractElementInst *Ext1,
402349cc55cSDimitry Andric                                           const Instruction &I,
4035ffd83dbSDimitry Andric                                           ExtractElementInst *&ConvertToShuffle,
4045ffd83dbSDimitry Andric                                           unsigned PreferredExtractIndex) {
40581ad6265SDimitry Andric   auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
40681ad6265SDimitry Andric   auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
40781ad6265SDimitry Andric   assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
40881ad6265SDimitry Andric 
409349cc55cSDimitry Andric   unsigned Opcode = I.getOpcode();
4105ffd83dbSDimitry Andric   Type *ScalarTy = Ext0->getType();
4115ffd83dbSDimitry Andric   auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
412e8d8bef9SDimitry Andric   InstructionCost ScalarOpCost, VectorOpCost;
4135ffd83dbSDimitry Andric 
4145ffd83dbSDimitry Andric   // Get cost estimates for scalar and vector versions of the operation.
4155ffd83dbSDimitry Andric   bool IsBinOp = Instruction::isBinaryOp(Opcode);
4165ffd83dbSDimitry Andric   if (IsBinOp) {
4175ffd83dbSDimitry Andric     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
4185ffd83dbSDimitry Andric     VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
4195ffd83dbSDimitry Andric   } else {
4205ffd83dbSDimitry Andric     assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
4215ffd83dbSDimitry Andric            "Expected a compare");
422349cc55cSDimitry Andric     CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
423349cc55cSDimitry Andric     ScalarOpCost = TTI.getCmpSelInstrCost(
424349cc55cSDimitry Andric         Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
425349cc55cSDimitry Andric     VectorOpCost = TTI.getCmpSelInstrCost(
426349cc55cSDimitry Andric         Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
4275ffd83dbSDimitry Andric   }
4285ffd83dbSDimitry Andric 
4295ffd83dbSDimitry Andric   // Get cost estimates for the extract elements. These costs will factor into
4305ffd83dbSDimitry Andric   // both sequences.
43181ad6265SDimitry Andric   unsigned Ext0Index = Ext0IndexC->getZExtValue();
43281ad6265SDimitry Andric   unsigned Ext1Index = Ext1IndexC->getZExtValue();
433bdd1243dSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4345ffd83dbSDimitry Andric 
435e8d8bef9SDimitry Andric   InstructionCost Extract0Cost =
436bdd1243dSDimitry Andric       TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
437e8d8bef9SDimitry Andric   InstructionCost Extract1Cost =
438bdd1243dSDimitry Andric       TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
4395ffd83dbSDimitry Andric 
4405ffd83dbSDimitry Andric   // A more expensive extract will always be replaced by a splat shuffle.
4415ffd83dbSDimitry Andric   // For example, if Ext0 is more expensive:
4425ffd83dbSDimitry Andric   // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
4435ffd83dbSDimitry Andric   // extelt (opcode (splat V0, Ext0), V1), Ext1
4445ffd83dbSDimitry Andric   // TODO: Evaluate whether that always results in lowest cost. Alternatively,
4455ffd83dbSDimitry Andric   //       check the cost of creating a broadcast shuffle and shuffling both
4465ffd83dbSDimitry Andric   //       operands to element 0.
447e8d8bef9SDimitry Andric   InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
4485ffd83dbSDimitry Andric 
4495ffd83dbSDimitry Andric   // Extra uses of the extracts mean that we include those costs in the
4505ffd83dbSDimitry Andric   // vector total because those instructions will not be eliminated.
451e8d8bef9SDimitry Andric   InstructionCost OldCost, NewCost;
4525ffd83dbSDimitry Andric   if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
4535ffd83dbSDimitry Andric     // Handle a special case. If the 2 extracts are identical, adjust the
4545ffd83dbSDimitry Andric     // formulas to account for that. The extra use charge allows for either the
4555ffd83dbSDimitry Andric     // CSE'd pattern or an unoptimized form with identical values:
4565ffd83dbSDimitry Andric     // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
4575ffd83dbSDimitry Andric     bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
4585ffd83dbSDimitry Andric                                   : !Ext0->hasOneUse() || !Ext1->hasOneUse();
4595ffd83dbSDimitry Andric     OldCost = CheapExtractCost + ScalarOpCost;
4605ffd83dbSDimitry Andric     NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
4615ffd83dbSDimitry Andric   } else {
4625ffd83dbSDimitry Andric     // Handle the general case. Each extract is actually a different value:
4635ffd83dbSDimitry Andric     // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
4645ffd83dbSDimitry Andric     OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
4655ffd83dbSDimitry Andric     NewCost = VectorOpCost + CheapExtractCost +
4665ffd83dbSDimitry Andric               !Ext0->hasOneUse() * Extract0Cost +
4675ffd83dbSDimitry Andric               !Ext1->hasOneUse() * Extract1Cost;
4685ffd83dbSDimitry Andric   }
4695ffd83dbSDimitry Andric 
4705ffd83dbSDimitry Andric   ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
4715ffd83dbSDimitry Andric   if (ConvertToShuffle) {
4725ffd83dbSDimitry Andric     if (IsBinOp && DisableBinopExtractShuffle)
4735ffd83dbSDimitry Andric       return true;
4745ffd83dbSDimitry Andric 
4755ffd83dbSDimitry Andric     // If we are extracting from 2 different indexes, then one operand must be
4765ffd83dbSDimitry Andric     // shuffled before performing the vector operation. The shuffle mask is
47706c3fb27SDimitry Andric     // poison except for 1 lane that is being translated to the remaining
4785ffd83dbSDimitry Andric     // extraction lane. Therefore, it is a splat shuffle. Ex:
47906c3fb27SDimitry Andric     // ShufMask = { poison, poison, 0, poison }
4805ffd83dbSDimitry Andric     // TODO: The cost model has an option for a "broadcast" shuffle
4815ffd83dbSDimitry Andric     //       (splat-from-element-0), but no option for a more general splat.
4825ffd83dbSDimitry Andric     NewCost +=
4835ffd83dbSDimitry Andric         TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
4845ffd83dbSDimitry Andric   }
4855ffd83dbSDimitry Andric 
4865ffd83dbSDimitry Andric   // Aggressively form a vector op if the cost is equal because the transform
4875ffd83dbSDimitry Andric   // may enable further optimization.
4885ffd83dbSDimitry Andric   // Codegen can reverse this transform (scalarize) if it was not profitable.
4895ffd83dbSDimitry Andric   return OldCost < NewCost;
4905ffd83dbSDimitry Andric }
4915ffd83dbSDimitry Andric 
4925ffd83dbSDimitry Andric /// Create a shuffle that translates (shifts) 1 element from the input vector
4935ffd83dbSDimitry Andric /// to a new element location.
4945ffd83dbSDimitry Andric static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
4955ffd83dbSDimitry Andric                                  unsigned NewIndex, IRBuilder<> &Builder) {
49606c3fb27SDimitry Andric   // The shuffle mask is poison except for 1 lane that is being translated
4975ffd83dbSDimitry Andric   // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
49806c3fb27SDimitry Andric   // ShufMask = { 2, poison, poison, poison }
4995ffd83dbSDimitry Andric   auto *VecTy = cast<FixedVectorType>(Vec->getType());
50006c3fb27SDimitry Andric   SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
5015ffd83dbSDimitry Andric   ShufMask[NewIndex] = OldIndex;
502e8d8bef9SDimitry Andric   return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
5035ffd83dbSDimitry Andric }
5045ffd83dbSDimitry Andric 
5055ffd83dbSDimitry Andric /// Given an extract element instruction with constant index operand, shuffle
5065ffd83dbSDimitry Andric /// the source vector (shift the scalar element) to a NewIndex for extraction.
5075ffd83dbSDimitry Andric /// Return null if the input can be constant folded, so that we are not creating
5085ffd83dbSDimitry Andric /// unnecessary instructions.
5095ffd83dbSDimitry Andric static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
5105ffd83dbSDimitry Andric                                             unsigned NewIndex,
5115ffd83dbSDimitry Andric                                             IRBuilder<> &Builder) {
512753f127fSDimitry Andric   // Shufflevectors can only be created for fixed-width vectors.
513753f127fSDimitry Andric   if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType()))
514753f127fSDimitry Andric     return nullptr;
515753f127fSDimitry Andric 
5165ffd83dbSDimitry Andric   // If the extract can be constant-folded, this code is unsimplified. Defer
5175ffd83dbSDimitry Andric   // to other passes to handle that.
5185ffd83dbSDimitry Andric   Value *X = ExtElt->getVectorOperand();
5195ffd83dbSDimitry Andric   Value *C = ExtElt->getIndexOperand();
5205ffd83dbSDimitry Andric   assert(isa<ConstantInt>(C) && "Expected a constant index operand");
5215ffd83dbSDimitry Andric   if (isa<Constant>(X))
5225ffd83dbSDimitry Andric     return nullptr;
5235ffd83dbSDimitry Andric 
5245ffd83dbSDimitry Andric   Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
5255ffd83dbSDimitry Andric                                    NewIndex, Builder);
5265ffd83dbSDimitry Andric   return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
5275ffd83dbSDimitry Andric }
5285ffd83dbSDimitry Andric 
5295ffd83dbSDimitry Andric /// Try to reduce extract element costs by converting scalar compares to vector
5305ffd83dbSDimitry Andric /// compares followed by extract.
5315ffd83dbSDimitry Andric /// cmp (ext0 V0, C), (ext1 V1, C)
5325ffd83dbSDimitry Andric void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
5335ffd83dbSDimitry Andric                                   ExtractElementInst *Ext1, Instruction &I) {
5345ffd83dbSDimitry Andric   assert(isa<CmpInst>(&I) && "Expected a compare");
5355ffd83dbSDimitry Andric   assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
5365ffd83dbSDimitry Andric              cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
5375ffd83dbSDimitry Andric          "Expected matching constant extract indexes");
5385ffd83dbSDimitry Andric 
5395ffd83dbSDimitry Andric   // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
5405ffd83dbSDimitry Andric   ++NumVecCmp;
5415ffd83dbSDimitry Andric   CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
5425ffd83dbSDimitry Andric   Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
5435ffd83dbSDimitry Andric   Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
5445ffd83dbSDimitry Andric   Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
5455ffd83dbSDimitry Andric   replaceValue(I, *NewExt);
5465ffd83dbSDimitry Andric }
5475ffd83dbSDimitry Andric 
5485ffd83dbSDimitry Andric /// Try to reduce extract element costs by converting scalar binops to vector
5495ffd83dbSDimitry Andric /// binops followed by extract.
5505ffd83dbSDimitry Andric /// bo (ext0 V0, C), (ext1 V1, C)
5515ffd83dbSDimitry Andric void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
5525ffd83dbSDimitry Andric                                     ExtractElementInst *Ext1, Instruction &I) {
5535ffd83dbSDimitry Andric   assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
5545ffd83dbSDimitry Andric   assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
5555ffd83dbSDimitry Andric              cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
5565ffd83dbSDimitry Andric          "Expected matching constant extract indexes");
5575ffd83dbSDimitry Andric 
5585ffd83dbSDimitry Andric   // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
5595ffd83dbSDimitry Andric   ++NumVecBO;
5605ffd83dbSDimitry Andric   Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
5615ffd83dbSDimitry Andric   Value *VecBO =
5625ffd83dbSDimitry Andric       Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
5635ffd83dbSDimitry Andric 
5645ffd83dbSDimitry Andric   // All IR flags are safe to back-propagate because any potential poison
5655ffd83dbSDimitry Andric   // created in unused vector elements is discarded by the extract.
5665ffd83dbSDimitry Andric   if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
5675ffd83dbSDimitry Andric     VecBOInst->copyIRFlags(&I);
5685ffd83dbSDimitry Andric 
5695ffd83dbSDimitry Andric   Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
5705ffd83dbSDimitry Andric   replaceValue(I, *NewExt);
5715ffd83dbSDimitry Andric }
5725ffd83dbSDimitry Andric 
5735ffd83dbSDimitry Andric /// Match an instruction with extracted vector operands.
5745ffd83dbSDimitry Andric bool VectorCombine::foldExtractExtract(Instruction &I) {
5755ffd83dbSDimitry Andric   // It is not safe to transform things like div, urem, etc. because we may
5765ffd83dbSDimitry Andric   // create undefined behavior when executing those on unknown vector elements.
5775ffd83dbSDimitry Andric   if (!isSafeToSpeculativelyExecute(&I))
5785ffd83dbSDimitry Andric     return false;
5795ffd83dbSDimitry Andric 
5805ffd83dbSDimitry Andric   Instruction *I0, *I1;
5815ffd83dbSDimitry Andric   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
5825ffd83dbSDimitry Andric   if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
5835ffd83dbSDimitry Andric       !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
5845ffd83dbSDimitry Andric     return false;
5855ffd83dbSDimitry Andric 
5865ffd83dbSDimitry Andric   Value *V0, *V1;
5875ffd83dbSDimitry Andric   uint64_t C0, C1;
5885ffd83dbSDimitry Andric   if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
5895ffd83dbSDimitry Andric       !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
5905ffd83dbSDimitry Andric       V0->getType() != V1->getType())
5915ffd83dbSDimitry Andric     return false;
5925ffd83dbSDimitry Andric 
5935ffd83dbSDimitry Andric   // If the scalar value 'I' is going to be re-inserted into a vector, then try
5945ffd83dbSDimitry Andric   // to create an extract to that same element. The extract/insert can be
5955ffd83dbSDimitry Andric   // reduced to a "select shuffle".
5965ffd83dbSDimitry Andric   // TODO: If we add a larger pattern match that starts from an insert, this
5975ffd83dbSDimitry Andric   //       probably becomes unnecessary.
5985ffd83dbSDimitry Andric   auto *Ext0 = cast<ExtractElementInst>(I0);
5995ffd83dbSDimitry Andric   auto *Ext1 = cast<ExtractElementInst>(I1);
6005ffd83dbSDimitry Andric   uint64_t InsertIndex = InvalidIndex;
6015ffd83dbSDimitry Andric   if (I.hasOneUse())
6025ffd83dbSDimitry Andric     match(I.user_back(),
6035ffd83dbSDimitry Andric           m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
6045ffd83dbSDimitry Andric 
6055ffd83dbSDimitry Andric   ExtractElementInst *ExtractToChange;
606349cc55cSDimitry Andric   if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
6075ffd83dbSDimitry Andric     return false;
6085ffd83dbSDimitry Andric 
6095ffd83dbSDimitry Andric   if (ExtractToChange) {
6105ffd83dbSDimitry Andric     unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
6115ffd83dbSDimitry Andric     ExtractElementInst *NewExtract =
6125ffd83dbSDimitry Andric         translateExtract(ExtractToChange, CheapExtractIdx, Builder);
6135ffd83dbSDimitry Andric     if (!NewExtract)
6145ffd83dbSDimitry Andric       return false;
6155ffd83dbSDimitry Andric     if (ExtractToChange == Ext0)
6165ffd83dbSDimitry Andric       Ext0 = NewExtract;
6175ffd83dbSDimitry Andric     else
6185ffd83dbSDimitry Andric       Ext1 = NewExtract;
6195ffd83dbSDimitry Andric   }
6205ffd83dbSDimitry Andric 
6215ffd83dbSDimitry Andric   if (Pred != CmpInst::BAD_ICMP_PREDICATE)
6225ffd83dbSDimitry Andric     foldExtExtCmp(Ext0, Ext1, I);
6235ffd83dbSDimitry Andric   else
6245ffd83dbSDimitry Andric     foldExtExtBinop(Ext0, Ext1, I);
6255ffd83dbSDimitry Andric 
626349cc55cSDimitry Andric   Worklist.push(Ext0);
627349cc55cSDimitry Andric   Worklist.push(Ext1);
6285ffd83dbSDimitry Andric   return true;
6295ffd83dbSDimitry Andric }
6305ffd83dbSDimitry Andric 
631bdd1243dSDimitry Andric /// Try to replace an extract + scalar fneg + insert with a vector fneg +
632bdd1243dSDimitry Andric /// shuffle.
633bdd1243dSDimitry Andric bool VectorCombine::foldInsExtFNeg(Instruction &I) {
634bdd1243dSDimitry Andric   // Match an insert (op (extract)) pattern.
635bdd1243dSDimitry Andric   Value *DestVec;
636bdd1243dSDimitry Andric   uint64_t Index;
637bdd1243dSDimitry Andric   Instruction *FNeg;
638bdd1243dSDimitry Andric   if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
639bdd1243dSDimitry Andric                              m_ConstantInt(Index))))
640bdd1243dSDimitry Andric     return false;
641bdd1243dSDimitry Andric 
642bdd1243dSDimitry Andric   // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
643bdd1243dSDimitry Andric   Value *SrcVec;
644bdd1243dSDimitry Andric   Instruction *Extract;
645bdd1243dSDimitry Andric   if (!match(FNeg, m_FNeg(m_CombineAnd(
646bdd1243dSDimitry Andric                        m_Instruction(Extract),
647bdd1243dSDimitry Andric                        m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
648bdd1243dSDimitry Andric     return false;
649bdd1243dSDimitry Andric 
650bdd1243dSDimitry Andric   // TODO: We could handle this with a length-changing shuffle.
651bdd1243dSDimitry Andric   auto *VecTy = cast<FixedVectorType>(I.getType());
652bdd1243dSDimitry Andric   if (SrcVec->getType() != VecTy)
653bdd1243dSDimitry Andric     return false;
654bdd1243dSDimitry Andric 
655bdd1243dSDimitry Andric   // Ignore bogus insert/extract index.
656bdd1243dSDimitry Andric   unsigned NumElts = VecTy->getNumElements();
657bdd1243dSDimitry Andric   if (Index >= NumElts)
658bdd1243dSDimitry Andric     return false;
659bdd1243dSDimitry Andric 
660bdd1243dSDimitry Andric   // We are inserting the negated element into the same lane that we extracted
661bdd1243dSDimitry Andric   // from. This is equivalent to a select-shuffle that chooses all but the
662bdd1243dSDimitry Andric   // negated element from the destination vector.
663bdd1243dSDimitry Andric   SmallVector<int> Mask(NumElts);
664bdd1243dSDimitry Andric   std::iota(Mask.begin(), Mask.end(), 0);
665bdd1243dSDimitry Andric   Mask[Index] = Index + NumElts;
666bdd1243dSDimitry Andric 
667bdd1243dSDimitry Andric   Type *ScalarTy = VecTy->getScalarType();
668bdd1243dSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
669bdd1243dSDimitry Andric   InstructionCost OldCost =
670bdd1243dSDimitry Andric       TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
671bdd1243dSDimitry Andric       TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
672bdd1243dSDimitry Andric 
673bdd1243dSDimitry Andric   // If the extract has one use, it will be eliminated, so count it in the
674bdd1243dSDimitry Andric   // original cost. If it has more than one use, ignore the cost because it will
675bdd1243dSDimitry Andric   // be the same before/after.
676bdd1243dSDimitry Andric   if (Extract->hasOneUse())
677bdd1243dSDimitry Andric     OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
678bdd1243dSDimitry Andric 
679bdd1243dSDimitry Andric   InstructionCost NewCost =
680bdd1243dSDimitry Andric       TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
681bdd1243dSDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
682bdd1243dSDimitry Andric 
683bdd1243dSDimitry Andric   if (NewCost > OldCost)
684bdd1243dSDimitry Andric     return false;
685bdd1243dSDimitry Andric 
686bdd1243dSDimitry Andric   // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
687bdd1243dSDimitry Andric   // shuffle DestVec, (fneg SrcVec), Mask
688bdd1243dSDimitry Andric   Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
689bdd1243dSDimitry Andric   Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
690bdd1243dSDimitry Andric   replaceValue(I, *Shuf);
691bdd1243dSDimitry Andric   return true;
692bdd1243dSDimitry Andric }
693bdd1243dSDimitry Andric 
6945ffd83dbSDimitry Andric /// If this is a bitcast of a shuffle, try to bitcast the source vector to the
6955ffd83dbSDimitry Andric /// destination type followed by shuffle. This can enable further transforms by
6965ffd83dbSDimitry Andric /// moving bitcasts or shuffles together.
6975f757f3fSDimitry Andric bool VectorCombine::foldBitcastShuffle(Instruction &I) {
6980fca6ea1SDimitry Andric   Value *V0, *V1;
6995ffd83dbSDimitry Andric   ArrayRef<int> Mask;
7000fca6ea1SDimitry Andric   if (!match(&I, m_BitCast(m_OneUse(
7010fca6ea1SDimitry Andric                      m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
7025ffd83dbSDimitry Andric     return false;
7035ffd83dbSDimitry Andric 
704e8d8bef9SDimitry Andric   // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
705e8d8bef9SDimitry Andric   // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
706e8d8bef9SDimitry Andric   // mask for scalable type is a splat or not.
7075f757f3fSDimitry Andric   // 2) Disallow non-vector casts.
7085ffd83dbSDimitry Andric   // TODO: We could allow any shuffle.
7095f757f3fSDimitry Andric   auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
7100fca6ea1SDimitry Andric   auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
7115f757f3fSDimitry Andric   if (!DestTy || !SrcTy)
7125ffd83dbSDimitry Andric     return false;
7135ffd83dbSDimitry Andric 
7145f757f3fSDimitry Andric   unsigned DestEltSize = DestTy->getScalarSizeInBits();
7155f757f3fSDimitry Andric   unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
7165f757f3fSDimitry Andric   if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
7175f757f3fSDimitry Andric     return false;
7185f757f3fSDimitry Andric 
7190fca6ea1SDimitry Andric   bool IsUnary = isa<UndefValue>(V1);
7200fca6ea1SDimitry Andric 
7210fca6ea1SDimitry Andric   // For binary shuffles, only fold bitcast(shuffle(X,Y))
7220fca6ea1SDimitry Andric   // if it won't increase the number of bitcasts.
7230fca6ea1SDimitry Andric   if (!IsUnary) {
7240fca6ea1SDimitry Andric     auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
7250fca6ea1SDimitry Andric     auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
7260fca6ea1SDimitry Andric     if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
7270fca6ea1SDimitry Andric         !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
7280fca6ea1SDimitry Andric       return false;
7290fca6ea1SDimitry Andric   }
7300fca6ea1SDimitry Andric 
7315ffd83dbSDimitry Andric   SmallVector<int, 16> NewMask;
7325f757f3fSDimitry Andric   if (DestEltSize <= SrcEltSize) {
7335ffd83dbSDimitry Andric     // The bitcast is from wide to narrow/equal elements. The shuffle mask can
7345ffd83dbSDimitry Andric     // always be expanded to the equivalent form choosing narrower elements.
7355f757f3fSDimitry Andric     assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
7365f757f3fSDimitry Andric     unsigned ScaleFactor = SrcEltSize / DestEltSize;
7375ffd83dbSDimitry Andric     narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
7385ffd83dbSDimitry Andric   } else {
7395ffd83dbSDimitry Andric     // The bitcast is from narrow elements to wide elements. The shuffle mask
7405ffd83dbSDimitry Andric     // must choose consecutive elements to allow casting first.
7415f757f3fSDimitry Andric     assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
7425f757f3fSDimitry Andric     unsigned ScaleFactor = DestEltSize / SrcEltSize;
7435ffd83dbSDimitry Andric     if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
7445ffd83dbSDimitry Andric       return false;
7455ffd83dbSDimitry Andric   }
746fe6060f1SDimitry Andric 
7475f757f3fSDimitry Andric   // Bitcast the shuffle src - keep its original width but using the destination
7485f757f3fSDimitry Andric   // scalar type.
7495f757f3fSDimitry Andric   unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
7500fca6ea1SDimitry Andric   auto *NewShuffleTy =
7510fca6ea1SDimitry Andric       FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
7520fca6ea1SDimitry Andric   auto *OldShuffleTy =
7530fca6ea1SDimitry Andric       FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
7540fca6ea1SDimitry Andric   unsigned NumOps = IsUnary ? 1 : 2;
7555f757f3fSDimitry Andric 
7560fca6ea1SDimitry Andric   // The new shuffle must not cost more than the old shuffle.
7570fca6ea1SDimitry Andric   TargetTransformInfo::TargetCostKind CK =
7580fca6ea1SDimitry Andric       TargetTransformInfo::TCK_RecipThroughput;
7590fca6ea1SDimitry Andric   TargetTransformInfo::ShuffleKind SK =
7600fca6ea1SDimitry Andric       IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
7610fca6ea1SDimitry Andric               : TargetTransformInfo::SK_PermuteTwoSrc;
7620fca6ea1SDimitry Andric 
7630fca6ea1SDimitry Andric   InstructionCost DestCost =
7640fca6ea1SDimitry Andric       TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
7650fca6ea1SDimitry Andric       (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
7660fca6ea1SDimitry Andric                                      TargetTransformInfo::CastContextHint::None,
7670fca6ea1SDimitry Andric                                      CK));
768fe6060f1SDimitry Andric   InstructionCost SrcCost =
7690fca6ea1SDimitry Andric       TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
7700fca6ea1SDimitry Andric       TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
7710fca6ea1SDimitry Andric                            TargetTransformInfo::CastContextHint::None, CK);
772fe6060f1SDimitry Andric   if (DestCost > SrcCost || !DestCost.isValid())
773fe6060f1SDimitry Andric     return false;
774fe6060f1SDimitry Andric 
7750fca6ea1SDimitry Andric   // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
7765ffd83dbSDimitry Andric   ++NumShufOfBitcast;
7770fca6ea1SDimitry Andric   Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
7780fca6ea1SDimitry Andric   Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
7790fca6ea1SDimitry Andric   Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
7805ffd83dbSDimitry Andric   replaceValue(I, *Shuf);
7815ffd83dbSDimitry Andric   return true;
7825ffd83dbSDimitry Andric }
7835ffd83dbSDimitry Andric 
7845f757f3fSDimitry Andric /// VP Intrinsics whose vector operands are both splat values may be simplified
7855f757f3fSDimitry Andric /// into the scalar version of the operation and the result splatted. This
7865f757f3fSDimitry Andric /// can lead to scalarization down the line.
7875f757f3fSDimitry Andric bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
7885f757f3fSDimitry Andric   if (!isa<VPIntrinsic>(I))
7895f757f3fSDimitry Andric     return false;
7905f757f3fSDimitry Andric   VPIntrinsic &VPI = cast<VPIntrinsic>(I);
7915f757f3fSDimitry Andric   Value *Op0 = VPI.getArgOperand(0);
7925f757f3fSDimitry Andric   Value *Op1 = VPI.getArgOperand(1);
7935f757f3fSDimitry Andric 
7945f757f3fSDimitry Andric   if (!isSplatValue(Op0) || !isSplatValue(Op1))
7955f757f3fSDimitry Andric     return false;
7965f757f3fSDimitry Andric 
7975f757f3fSDimitry Andric   // Check getSplatValue early in this function, to avoid doing unnecessary
7985f757f3fSDimitry Andric   // work.
7995f757f3fSDimitry Andric   Value *ScalarOp0 = getSplatValue(Op0);
8005f757f3fSDimitry Andric   Value *ScalarOp1 = getSplatValue(Op1);
8015f757f3fSDimitry Andric   if (!ScalarOp0 || !ScalarOp1)
8025f757f3fSDimitry Andric     return false;
8035f757f3fSDimitry Andric 
8045f757f3fSDimitry Andric   // For the binary VP intrinsics supported here, the result on disabled lanes
8055f757f3fSDimitry Andric   // is a poison value. For now, only do this simplification if all lanes
8065f757f3fSDimitry Andric   // are active.
8075f757f3fSDimitry Andric   // TODO: Relax the condition that all lanes are active by using insertelement
8085f757f3fSDimitry Andric   // on inactive lanes.
8095f757f3fSDimitry Andric   auto IsAllTrueMask = [](Value *MaskVal) {
8105f757f3fSDimitry Andric     if (Value *SplattedVal = getSplatValue(MaskVal))
8115f757f3fSDimitry Andric       if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
8125f757f3fSDimitry Andric         return ConstValue->isAllOnesValue();
8135f757f3fSDimitry Andric     return false;
8145f757f3fSDimitry Andric   };
8155f757f3fSDimitry Andric   if (!IsAllTrueMask(VPI.getArgOperand(2)))
8165f757f3fSDimitry Andric     return false;
8175f757f3fSDimitry Andric 
8185f757f3fSDimitry Andric   // Check to make sure we support scalarization of the intrinsic
8195f757f3fSDimitry Andric   Intrinsic::ID IntrID = VPI.getIntrinsicID();
8205f757f3fSDimitry Andric   if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
8215f757f3fSDimitry Andric     return false;
8225f757f3fSDimitry Andric 
8235f757f3fSDimitry Andric   // Calculate cost of splatting both operands into vectors and the vector
8245f757f3fSDimitry Andric   // intrinsic
8255f757f3fSDimitry Andric   VectorType *VecTy = cast<VectorType>(VPI.getType());
8265f757f3fSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8270fca6ea1SDimitry Andric   SmallVector<int> Mask;
8280fca6ea1SDimitry Andric   if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
8290fca6ea1SDimitry Andric     Mask.resize(FVTy->getNumElements(), 0);
8305f757f3fSDimitry Andric   InstructionCost SplatCost =
8315f757f3fSDimitry Andric       TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
8320fca6ea1SDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask);
8335f757f3fSDimitry Andric 
8345f757f3fSDimitry Andric   // Calculate the cost of the VP Intrinsic
8355f757f3fSDimitry Andric   SmallVector<Type *, 4> Args;
8365f757f3fSDimitry Andric   for (Value *V : VPI.args())
8375f757f3fSDimitry Andric     Args.push_back(V->getType());
8385f757f3fSDimitry Andric   IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
8395f757f3fSDimitry Andric   InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
8405f757f3fSDimitry Andric   InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
8415f757f3fSDimitry Andric 
8425f757f3fSDimitry Andric   // Determine scalar opcode
8435f757f3fSDimitry Andric   std::optional<unsigned> FunctionalOpcode =
8445f757f3fSDimitry Andric       VPI.getFunctionalOpcode();
8455f757f3fSDimitry Andric   std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
8465f757f3fSDimitry Andric   if (!FunctionalOpcode) {
8475f757f3fSDimitry Andric     ScalarIntrID = VPI.getFunctionalIntrinsicID();
8485f757f3fSDimitry Andric     if (!ScalarIntrID)
8495f757f3fSDimitry Andric       return false;
8505f757f3fSDimitry Andric   }
8515f757f3fSDimitry Andric 
8525f757f3fSDimitry Andric   // Calculate cost of scalarizing
8535f757f3fSDimitry Andric   InstructionCost ScalarOpCost = 0;
8545f757f3fSDimitry Andric   if (ScalarIntrID) {
8555f757f3fSDimitry Andric     IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
8565f757f3fSDimitry Andric     ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
8575f757f3fSDimitry Andric   } else {
8585f757f3fSDimitry Andric     ScalarOpCost =
8595f757f3fSDimitry Andric         TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType());
8605f757f3fSDimitry Andric   }
8615f757f3fSDimitry Andric 
8625f757f3fSDimitry Andric   // The existing splats may be kept around if other instructions use them.
8635f757f3fSDimitry Andric   InstructionCost CostToKeepSplats =
8645f757f3fSDimitry Andric       (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
8655f757f3fSDimitry Andric   InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
8665f757f3fSDimitry Andric 
8675f757f3fSDimitry Andric   LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
8685f757f3fSDimitry Andric                     << "\n");
8695f757f3fSDimitry Andric   LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
8705f757f3fSDimitry Andric                     << ", Cost of scalarizing:" << NewCost << "\n");
8715f757f3fSDimitry Andric 
8725f757f3fSDimitry Andric   // We want to scalarize unless the vector variant actually has lower cost.
8735f757f3fSDimitry Andric   if (OldCost < NewCost || !NewCost.isValid())
8745f757f3fSDimitry Andric     return false;
8755f757f3fSDimitry Andric 
8765f757f3fSDimitry Andric   // Scalarize the intrinsic
8775f757f3fSDimitry Andric   ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
8785f757f3fSDimitry Andric   Value *EVL = VPI.getArgOperand(3);
8795f757f3fSDimitry Andric 
8805f757f3fSDimitry Andric   // If the VP op might introduce UB or poison, we can scalarize it provided
8815f757f3fSDimitry Andric   // that we know the EVL > 0: If the EVL is zero, then the original VP op
8825f757f3fSDimitry Andric   // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
8835f757f3fSDimitry Andric   // scalarizing it.
8845f757f3fSDimitry Andric   bool SafeToSpeculate;
8855f757f3fSDimitry Andric   if (ScalarIntrID)
8865f757f3fSDimitry Andric     SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID)
8875f757f3fSDimitry Andric                           .hasFnAttr(Attribute::AttrKind::Speculatable);
8885f757f3fSDimitry Andric   else
8895f757f3fSDimitry Andric     SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
8905f757f3fSDimitry Andric         *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
8910fca6ea1SDimitry Andric   if (!SafeToSpeculate &&
8920fca6ea1SDimitry Andric       !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
8935f757f3fSDimitry Andric     return false;
8945f757f3fSDimitry Andric 
8955f757f3fSDimitry Andric   Value *ScalarVal =
8965f757f3fSDimitry Andric       ScalarIntrID
8975f757f3fSDimitry Andric           ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
8985f757f3fSDimitry Andric                                     {ScalarOp0, ScalarOp1})
8995f757f3fSDimitry Andric           : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
9005f757f3fSDimitry Andric                                 ScalarOp0, ScalarOp1);
9015f757f3fSDimitry Andric 
9025f757f3fSDimitry Andric   replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
9035f757f3fSDimitry Andric   return true;
9045f757f3fSDimitry Andric }
9055f757f3fSDimitry Andric 
/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  Value *Ins0, *Ins1;
  // Pred is only overwritten by the m_Cmp matcher, so it doubles as the
  // binop-vs-cmp discriminator below.
  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
    return false;

  // Do not convert the vector condition of a vector select into a scalar
  // condition. That may cause problems for codegen because of differences in
  // boolean formats and register-file transfers.
  // TODO: Can we account for that in the cost model?
  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
  if (IsCmp)
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;

  // Match against one or both scalar values being inserted into constant
  // vectors:
  // vec_op VecC0, (inselt VecC1, V1, Index)
  // vec_op (inselt VecC0, V0, Index), VecC1
  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
  // TODO: Deal with mismatched index constants and variable indexes?
  Constant *VecC0 = nullptr, *VecC1 = nullptr;
  Value *V0 = nullptr, *V1 = nullptr;
  uint64_t Index0 = 0, Index1 = 0;
  if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
                               m_ConstantInt(Index0))) &&
      !match(Ins0, m_Constant(VecC0)))
    return false;
  if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
                               m_ConstantInt(Index1))) &&
      !match(Ins1, m_Constant(VecC1)))
    return false;

  // V0/V1 remain null when the corresponding operand was a plain constant.
  bool IsConst0 = !V0;
  bool IsConst1 = !V1;
  if (IsConst0 && IsConst1)
    return false;
  // With two inserted scalars, they must target the same lane.
  if (!IsConst0 && !IsConst1 && Index0 != Index1)
    return false;

  // Bail for single insertion if it is a load.
  // TODO: Handle this once getVectorInstrCost can cost for load/stores.
  auto *I0 = dyn_cast_or_null<Instruction>(V0);
  auto *I1 = dyn_cast_or_null<Instruction>(V1);
  if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
      (IsConst1 && I0 && I0->mayReadFromMemory()))
    return false;

  uint64_t Index = IsConst0 ? Index1 : Index0;
  Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
  Type *VecTy = I.getType();
  assert(VecTy->isVectorTy() &&
         (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
          ScalarTy->isPointerTy()) &&
         "Unexpected types for insert element into binop or cmp");

  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (IsCmp) {
    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  }

  // Get cost estimate for the insert element. This cost will factor into
  // both sequences.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = TTI.getVectorInstrCost(
      Instruction::InsertElement, VecTy, CostKind, Index);
  // New sequence: scalar op + one insert. If an original insert has other
  // users, it must still be emitted, so charge its cost back.
  InstructionCost OldCost =
      (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
  InstructionCost NewCost = ScalarOpCost + InsertCost +
                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
  // inselt NewVecC, (scalar_op V0, V1), Index
  if (IsCmp)
    ++NumScalarCmp;
  else
    ++NumScalarBO;

  // For constant cases, extract the scalar element, this should constant fold.
  if (IsConst0)
    V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
  if (IsConst1)
    V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));

  // Perform the operation on the scalar elements.
  Value *Scalar =
      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);

  Scalar->setName(I.getName() + ".scalar");

  // All IR flags are safe to back-propagate. There is no potential for extra
  // poison to be created by the scalar instruction.
  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
    ScalarInst->copyIRFlags(&I);

  // Fold the vector constants in the original vectors into a new base vector.
  Value *NewVecC =
      IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
  replaceValue(I, *Insert);
  return true;
}
10275ffd83dbSDimitry Andric 
/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
bool VectorCombine::foldExtractedCmps(Instruction &I) {
  // We are looking for a scalar binop of booleans.
  // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
  if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
    return false;

  // The compare predicates should match, and each compare should have a
  // constant operand.
  // TODO: Relax the one-use constraints.
  Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
  Instruction *I0, *I1;
  Constant *C0, *C1;
  CmpInst::Predicate P0, P1;
  if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
      !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
      P0 != P1)
    return false;

  // The compare operands must be extracts of the same vector with constant
  // extract indexes.
  // TODO: Relax the one-use constraints.
  Value *X;
  uint64_t Index0, Index1;
  if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
      !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
    return false;

  // getShuffleExtract picks which extract is cheaper to replace by a shuffle;
  // if neither is profitable to convert, give up.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
  if (!ConvertToShuf)
    return false;

  // The original scalar pattern is:
  // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
  CmpInst::Predicate Pred = P0;
  unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
                                                    : Instruction::ICmp;
  auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
  if (!VecTy)
    return false;

  // Cost of the original: two extracts, two scalar compares, one scalar binop.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost OldCost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
  OldCost +=
      TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                             CmpInst::makeCmpResultType(I0->getType()), Pred) *
      2;
  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());

  // The proposed vector pattern is:
  // vcmp = cmp Pred X, VecC
  // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
  InstructionCost NewCost = TTI.getCmpSelInstrCost(
      CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
  // The shuffle only needs to move the expensive lane onto the cheap lane.
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
  ShufMask[CheapIndex] = ExpensiveIndex;
  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                ShufMask);
  NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
  NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);

  // Aggressively form vector ops if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Create a vector constant from the 2 scalar constants.
  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
                                   PoisonValue::get(VecTy->getElementType()));
  CmpC[Index0] = C0;
  CmpC[Index1] = C1;
  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));

  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
                                        VCmp, Shuf);
  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
  replaceValue(I, *NewExt);
  ++NumVecCmpBO;
  return true;
}
11195ffd83dbSDimitry Andric 
1120fe6060f1SDimitry Andric // Check if memory loc modified between two instrs in the same BB
1121fe6060f1SDimitry Andric static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1122fe6060f1SDimitry Andric                                  BasicBlock::iterator End,
1123fe6060f1SDimitry Andric                                  const MemoryLocation &Loc, AAResults &AA) {
1124fe6060f1SDimitry Andric   unsigned NumScanned = 0;
1125fe6060f1SDimitry Andric   return std::any_of(Begin, End, [&](const Instruction &Instr) {
1126fe6060f1SDimitry Andric     return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1127fe6060f1SDimitry Andric            ++NumScanned > MaxInstrsToScan;
1128fe6060f1SDimitry Andric   });
1129fe6060f1SDimitry Andric }
1130fe6060f1SDimitry Andric 
namespace {
/// Helper class to indicate whether a vector index can be safely scalarized
/// and if a freeze needs to be inserted.
class ScalarizationResult {
  enum class StatusTy { Unsafe, Safe, SafeWithFreeze };

  StatusTy Status;
  // Value that must be frozen before the scalarized access when
  // Status == SafeWithFreeze; nullptr otherwise.
  Value *ToFreeze;

  ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
      : Status(Status), ToFreeze(ToFreeze) {}

public:
  ScalarizationResult(const ScalarizationResult &Other) = default;
  // The destructor enforces the protocol: a SafeWithFreeze result must be
  // consumed via freeze() or explicitly dropped via discard().
  ~ScalarizationResult() {
    assert(!ToFreeze && "freeze() not called with ToFreeze being set");
  }

  static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
  static ScalarizationResult safe() { return {StatusTy::Safe}; }
  static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
    return {StatusTy::SafeWithFreeze, ToFreeze};
  }

  /// Returns true if the index can be scalarized without requiring a freeze.
  bool isSafe() const { return Status == StatusTy::Safe; }
  /// Returns true if the index cannot be scalarized.
  bool isUnsafe() const { return Status == StatusTy::Unsafe; }
  /// Returns true if the index can be scalarized, but requires inserting a
  /// freeze.
  bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }

  /// Reset the state to Unsafe and clear ToFreeze if set.
  void discard() {
    ToFreeze = nullptr;
    Status = StatusTy::Safe;
  }

  /// Freeze the ToFreeze and update the use in \p UserI to use it.
  void freeze(IRBuilder<> &Builder, Instruction &UserI) {
    assert(isSafeWithFreeze() &&
           "should only be used when freezing is required");
    assert(is_contained(ToFreeze->users(), &UserI) &&
           "UserI must be a user of ToFreeze");
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(cast<Instruction>(&UserI));
    Value *Frozen =
        Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
    // Redirect every operand of UserI that referenced ToFreeze to the frozen
    // value.
    for (Use &U : make_early_inc_range((UserI.operands())))
      if (U.get() == ToFreeze)
        U.set(Frozen);

    ToFreeze = nullptr;
  }
};
} // namespace
1187349cc55cSDimitry Andric 
/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
                                              Instruction *CtxI,
                                              AssumptionCache &AC,
                                              const DominatorTree &DT) {
  // We do checks for both fixed vector types and scalable vector types.
  // This is the number of elements of fixed vector types,
  // or the minimum number of elements of scalable vector types.
  uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();

  // A constant index is safe iff it is within the (minimum) element count.
  if (auto *C = dyn_cast<ConstantInt>(Idx)) {
    if (C->getValue().ult(NumElements))
      return ScalarizationResult::safe();
    return ScalarizationResult::unsafe();
  }

  unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
  APInt Zero(IntWidth, 0);
  APInt MaxElts(IntWidth, NumElements);
  ConstantRange ValidIndices(Zero, MaxElts);
  // Start from the full range; it is narrowed below for freezable patterns.
  ConstantRange IdxRange(IntWidth, true);

  // A non-poison index is safe when value analysis proves it is in bounds.
  if (isGuaranteedNotToBePoison(Idx, &AC)) {
    if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
                                                   true, &AC, CtxI, &DT)))
      return ScalarizationResult::safe();
    return ScalarizationResult::unsafe();
  }

  // If the index may be poison, check if we can insert a freeze before the
  // range of the index is restricted.
  Value *IdxBase;
  ConstantInt *CI;
  if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
    IdxRange = IdxRange.binaryAnd(CI->getValue());
  } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
    IdxRange = IdxRange.urem(CI->getValue());
  }

  // The and/urem clamps any value of IdxBase into IdxRange, so freezing
  // IdxBase (not Idx) is sufficient to make the access in-bounds.
  if (ValidIndices.contains(IdxRange))
    return ScalarizationResult::safeWithFreeze(IdxBase);
  return ScalarizationResult::unsafe();
}
1232fe6060f1SDimitry Andric 
1233fe6060f1SDimitry Andric /// The memory operation on a vector of \p ScalarType had alignment of
1234fe6060f1SDimitry Andric /// \p VectorAlignment. Compute the maximal, but conservatively correct,
1235fe6060f1SDimitry Andric /// alignment that will be valid for the memory operation on a single scalar
1236fe6060f1SDimitry Andric /// element of the same type with index \p Idx.
1237fe6060f1SDimitry Andric static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1238fe6060f1SDimitry Andric                                                 Type *ScalarType, Value *Idx,
1239fe6060f1SDimitry Andric                                                 const DataLayout &DL) {
1240fe6060f1SDimitry Andric   if (auto *C = dyn_cast<ConstantInt>(Idx))
1241fe6060f1SDimitry Andric     return commonAlignment(VectorAlignment,
1242fe6060f1SDimitry Andric                            C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1243fe6060f1SDimitry Andric   return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1244fe6060f1SDimitry Andric }
1245fe6060f1SDimitry Andric 
// Combine patterns like:
//   %0 = load <4 x i32>, <4 x i32>* %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, <4 x i32>* %a
// to:
//   %0 = bitcast <4 x i32>* %a to i32*
//   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
//   store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
  auto *SI = cast<StoreInst>(&I);
  if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
    return false;

  // TODO: Combine more complicated patterns (multiple insert) by referencing
  // TargetTransformInfo.
  Instruction *Source;
  Value *NewElement;
  Value *Idx;
  if (!match(SI->getValueOperand(),
             m_InsertElt(m_Instruction(Source), m_Value(NewElement),
                         m_Value(Idx))))
    return false;

  if (auto *Load = dyn_cast<LoadInst>(Source)) {
    auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
    // Don't optimize for atomic/volatile load or store. Ensure memory is not
    // modified between, vector type matches store size, and index is inbounds.
    if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
        !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
        SrcAddr != SI->getPointerOperand()->stripPointerCasts())
      return false;

    auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
    if (ScalarizableIdx.isUnsafe() ||
        isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
                             MemoryLocation::get(SI), AA))
      return false;

    // Guard a possibly-poison variable index with a freeze before using it to
    // address memory.
    if (ScalarizableIdx.isSafeWithFreeze())
      ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
    Value *GEP = Builder.CreateInBoundsGEP(
        SI->getValueOperand()->getType(), SI->getPointerOperand(),
        {ConstantInt::get(Idx->getType(), 0), Idx});
    StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
    NSI->copyMetadata(*SI);
    // Start from the stronger of the two known alignments, then shrink to
    // what is valid for the scalar element at Idx.
    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
        *DL);
    NSI->setAlignment(ScalarOpAlignment);
    replaceValue(I, *NSI);
    eraseInstruction(I);
    return true;
  }

  return false;
}
1303fe6060f1SDimitry Andric 
/// Try to scalarize vector loads feeding extractelement instructions.
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
  Value *Ptr;
  if (!match(&I, m_Load(m_Value(Ptr))))
    return false;

  auto *VecTy = cast<VectorType>(I.getType());
  auto *LI = cast<LoadInst>(&I);
  if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
    return false;

  InstructionCost OriginalCost =
      TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
                          LI->getPointerAddressSpace());
  InstructionCost ScalarizedCost = 0;

  Instruction *LastCheckedInst = LI;
  unsigned NumInstChecked = 0;
  // Extracts whose (variable) index requires a freeze before scalarization.
  DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
  auto FailureGuard = make_scope_exit([&]() {
    // If the transform is aborted, discard the ScalarizationResults.
    for (auto &Pair : NeedFreeze)
      Pair.second.discard();
  });

  // Check if all users of the load are extracts with no memory modifications
  // between the load and the extract. Compute the cost of both the original
  // code and the scalarized version.
  for (User *U : LI->users()) {
    auto *UI = dyn_cast<ExtractElementInst>(U);
    if (!UI || UI->getParent() != LI->getParent())
      return false;

    // Check if any instruction between the load and the extract may modify
    // memory.
    if (LastCheckedInst->comesBefore(UI)) {
      for (Instruction &I :
           make_range(std::next(LI->getIterator()), UI->getIterator())) {
        // Bail out if we reached the check limit or the instruction may write
        // to memory.
        if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
          return false;
        NumInstChecked++;
      }
      // Remember the furthest instruction scanned so ranges are not re-checked
      // for later extracts.
      LastCheckedInst = UI;
    }

    auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT);
    if (ScalarIdx.isUnsafe())
      return false;
    if (ScalarIdx.isSafeWithFreeze()) {
      NeedFreeze.try_emplace(UI, ScalarIdx);
      // The local copy is consumed by the map entry; silence its destructor.
      ScalarIdx.discard();
    }

    auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    OriginalCost +=
        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                               Index ? Index->getZExtValue() : -1);
    ScalarizedCost +=
        TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
                            Align(1), LI->getPointerAddressSpace());
    ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType());
  }

  if (ScalarizedCost >= OriginalCost)
    return false;

  // Replace extracts with narrow scalar loads.
  for (User *U : LI->users()) {
    auto *EI = cast<ExtractElementInst>(U);
    Value *Idx = EI->getOperand(1);

    // Insert 'freeze' for poison indexes.
    auto It = NeedFreeze.find(EI);
    if (It != NeedFreeze.end())
      It->second.freeze(Builder, *cast<Instruction>(Idx));

    Builder.SetInsertPoint(EI);
    Value *GEP =
        Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
    auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
        VecTy->getElementType(), GEP, EI->getName() + ".scalar"));

    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        LI->getAlign(), VecTy->getElementType(), Idx, *DL);
    NewLoad->setAlignment(ScalarOpAlignment);

    replaceValue(*EI, *NewLoad);
  }

  // The transform succeeded; keep the ScalarizationResults consumed above.
  FailureGuard.release();
  return true;
}
1399fe6060f1SDimitry Andric 
14000fca6ea1SDimitry Andric /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
1401349cc55cSDimitry Andric bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
1402349cc55cSDimitry Andric   BinaryOperator *B0, *B1;
14030fca6ea1SDimitry Andric   ArrayRef<int> OldMask;
1404349cc55cSDimitry Andric   if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
14050fca6ea1SDimitry Andric                            m_Mask(OldMask))))
1406349cc55cSDimitry Andric     return false;
1407349cc55cSDimitry Andric 
14080fca6ea1SDimitry Andric   // Don't introduce poison into div/rem.
14090fca6ea1SDimitry Andric   if (any_of(OldMask, [](int M) { return M == PoisonMaskElem; }) &&
14100fca6ea1SDimitry Andric       B0->isIntDivRem())
1411349cc55cSDimitry Andric     return false;
1412349cc55cSDimitry Andric 
14130fca6ea1SDimitry Andric   // TODO: Add support for addlike etc.
14140fca6ea1SDimitry Andric   Instruction::BinaryOps Opcode = B0->getOpcode();
14150fca6ea1SDimitry Andric   if (Opcode != B1->getOpcode())
14160fca6ea1SDimitry Andric     return false;
14170fca6ea1SDimitry Andric 
14180fca6ea1SDimitry Andric   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
14190fca6ea1SDimitry Andric   auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType());
14200fca6ea1SDimitry Andric   if (!ShuffleDstTy || !BinOpTy)
14210fca6ea1SDimitry Andric     return false;
14220fca6ea1SDimitry Andric 
14230fca6ea1SDimitry Andric   unsigned NumSrcElts = BinOpTy->getNumElements();
14240fca6ea1SDimitry Andric 
1425349cc55cSDimitry Andric   // If we have something like "add X, Y" and "add Z, X", swap ops to match.
1426349cc55cSDimitry Andric   Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
1427349cc55cSDimitry Andric   Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
14280fca6ea1SDimitry Andric   if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
14290fca6ea1SDimitry Andric       (X == W || Y == Z))
1430349cc55cSDimitry Andric     std::swap(X, Y);
1431349cc55cSDimitry Andric 
14320fca6ea1SDimitry Andric   auto ConvertToUnary = [NumSrcElts](int &M) {
14330fca6ea1SDimitry Andric     if (M >= (int)NumSrcElts)
14340fca6ea1SDimitry Andric       M -= NumSrcElts;
14350fca6ea1SDimitry Andric   };
14360fca6ea1SDimitry Andric 
14370fca6ea1SDimitry Andric   SmallVector<int> NewMask0(OldMask.begin(), OldMask.end());
14380fca6ea1SDimitry Andric   TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
1439349cc55cSDimitry Andric   if (X == Z) {
14400fca6ea1SDimitry Andric     llvm::for_each(NewMask0, ConvertToUnary);
14410fca6ea1SDimitry Andric     SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
14420fca6ea1SDimitry Andric     Z = PoisonValue::get(BinOpTy);
1443349cc55cSDimitry Andric   }
1444349cc55cSDimitry Andric 
14450fca6ea1SDimitry Andric   SmallVector<int> NewMask1(OldMask.begin(), OldMask.end());
14460fca6ea1SDimitry Andric   TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
14470fca6ea1SDimitry Andric   if (Y == W) {
14480fca6ea1SDimitry Andric     llvm::for_each(NewMask1, ConvertToUnary);
14490fca6ea1SDimitry Andric     SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
14500fca6ea1SDimitry Andric     W = PoisonValue::get(BinOpTy);
14510fca6ea1SDimitry Andric   }
14520fca6ea1SDimitry Andric 
14530fca6ea1SDimitry Andric   // Try to replace a binop with a shuffle if the shuffle is not costly.
14540fca6ea1SDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14550fca6ea1SDimitry Andric 
14560fca6ea1SDimitry Andric   InstructionCost OldCost =
14570fca6ea1SDimitry Andric       TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) +
14580fca6ea1SDimitry Andric       TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) +
14590fca6ea1SDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
14600fca6ea1SDimitry Andric                          OldMask, CostKind, 0, nullptr, {B0, B1}, &I);
14610fca6ea1SDimitry Andric 
14620fca6ea1SDimitry Andric   InstructionCost NewCost =
14630fca6ea1SDimitry Andric       TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
14640fca6ea1SDimitry Andric       TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
14650fca6ea1SDimitry Andric       TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
14660fca6ea1SDimitry Andric 
14670fca6ea1SDimitry Andric   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
14680fca6ea1SDimitry Andric                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
14690fca6ea1SDimitry Andric                     << "\n");
14700fca6ea1SDimitry Andric   if (NewCost >= OldCost)
14710fca6ea1SDimitry Andric     return false;
14720fca6ea1SDimitry Andric 
14730fca6ea1SDimitry Andric   Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
14740fca6ea1SDimitry Andric   Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
1475349cc55cSDimitry Andric   Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
14760fca6ea1SDimitry Andric 
1477349cc55cSDimitry Andric   // Intersect flags from the old binops.
1478349cc55cSDimitry Andric   if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
1479349cc55cSDimitry Andric     NewInst->copyIRFlags(B0);
1480349cc55cSDimitry Andric     NewInst->andIRFlags(B1);
1481349cc55cSDimitry Andric   }
14820fca6ea1SDimitry Andric 
14830fca6ea1SDimitry Andric   Worklist.pushValue(Shuf0);
14840fca6ea1SDimitry Andric   Worklist.pushValue(Shuf1);
1485349cc55cSDimitry Andric   replaceValue(I, *NewBO);
1486349cc55cSDimitry Andric   return true;
1487349cc55cSDimitry Andric }
1488349cc55cSDimitry Andric 
14890fca6ea1SDimitry Andric /// Try to convert "shuffle (castop), (castop)" with a shared castop operand
14900fca6ea1SDimitry Andric /// into "castop (shuffle)".
14910fca6ea1SDimitry Andric bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
14920fca6ea1SDimitry Andric   Value *V0, *V1;
14930fca6ea1SDimitry Andric   ArrayRef<int> OldMask;
14940fca6ea1SDimitry Andric   if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
14950fca6ea1SDimitry Andric     return false;
14960fca6ea1SDimitry Andric 
14970fca6ea1SDimitry Andric   auto *C0 = dyn_cast<CastInst>(V0);
14980fca6ea1SDimitry Andric   auto *C1 = dyn_cast<CastInst>(V1);
14990fca6ea1SDimitry Andric   if (!C0 || !C1)
15000fca6ea1SDimitry Andric     return false;
15010fca6ea1SDimitry Andric 
15020fca6ea1SDimitry Andric   Instruction::CastOps Opcode = C0->getOpcode();
15030fca6ea1SDimitry Andric   if (C0->getSrcTy() != C1->getSrcTy())
15040fca6ea1SDimitry Andric     return false;
15050fca6ea1SDimitry Andric 
15060fca6ea1SDimitry Andric   // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
15070fca6ea1SDimitry Andric   if (Opcode != C1->getOpcode()) {
15080fca6ea1SDimitry Andric     if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
15090fca6ea1SDimitry Andric       Opcode = Instruction::SExt;
15100fca6ea1SDimitry Andric     else
15110fca6ea1SDimitry Andric       return false;
15120fca6ea1SDimitry Andric   }
15130fca6ea1SDimitry Andric 
15140fca6ea1SDimitry Andric   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
15150fca6ea1SDimitry Andric   auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
15160fca6ea1SDimitry Andric   auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
15170fca6ea1SDimitry Andric   if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
15180fca6ea1SDimitry Andric     return false;
15190fca6ea1SDimitry Andric 
15200fca6ea1SDimitry Andric   unsigned NumSrcElts = CastSrcTy->getNumElements();
15210fca6ea1SDimitry Andric   unsigned NumDstElts = CastDstTy->getNumElements();
15220fca6ea1SDimitry Andric   assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
15230fca6ea1SDimitry Andric          "Only bitcasts expected to alter src/dst element counts");
15240fca6ea1SDimitry Andric 
15250fca6ea1SDimitry Andric   // Check for bitcasting of unscalable vector types.
15260fca6ea1SDimitry Andric   // e.g. <32 x i40> -> <40 x i32>
15270fca6ea1SDimitry Andric   if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
15280fca6ea1SDimitry Andric       (NumDstElts % NumSrcElts) != 0)
15290fca6ea1SDimitry Andric     return false;
15300fca6ea1SDimitry Andric 
15310fca6ea1SDimitry Andric   SmallVector<int, 16> NewMask;
15320fca6ea1SDimitry Andric   if (NumSrcElts >= NumDstElts) {
15330fca6ea1SDimitry Andric     // The bitcast is from wide to narrow/equal elements. The shuffle mask can
15340fca6ea1SDimitry Andric     // always be expanded to the equivalent form choosing narrower elements.
15350fca6ea1SDimitry Andric     assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
15360fca6ea1SDimitry Andric     unsigned ScaleFactor = NumSrcElts / NumDstElts;
15370fca6ea1SDimitry Andric     narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
15380fca6ea1SDimitry Andric   } else {
15390fca6ea1SDimitry Andric     // The bitcast is from narrow elements to wide elements. The shuffle mask
15400fca6ea1SDimitry Andric     // must choose consecutive elements to allow casting first.
15410fca6ea1SDimitry Andric     assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
15420fca6ea1SDimitry Andric     unsigned ScaleFactor = NumDstElts / NumSrcElts;
15430fca6ea1SDimitry Andric     if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
15440fca6ea1SDimitry Andric       return false;
15450fca6ea1SDimitry Andric   }
15460fca6ea1SDimitry Andric 
15470fca6ea1SDimitry Andric   auto *NewShuffleDstTy =
15480fca6ea1SDimitry Andric       FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
15490fca6ea1SDimitry Andric 
15500fca6ea1SDimitry Andric   // Try to replace a castop with a shuffle if the shuffle is not costly.
15510fca6ea1SDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15520fca6ea1SDimitry Andric 
15530fca6ea1SDimitry Andric   InstructionCost CostC0 =
15540fca6ea1SDimitry Andric       TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
15550fca6ea1SDimitry Andric                            TTI::CastContextHint::None, CostKind);
15560fca6ea1SDimitry Andric   InstructionCost CostC1 =
15570fca6ea1SDimitry Andric       TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
15580fca6ea1SDimitry Andric                            TTI::CastContextHint::None, CostKind);
15590fca6ea1SDimitry Andric   InstructionCost OldCost = CostC0 + CostC1;
15600fca6ea1SDimitry Andric   OldCost +=
15610fca6ea1SDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy,
15620fca6ea1SDimitry Andric                          OldMask, CostKind, 0, nullptr, std::nullopt, &I);
15630fca6ea1SDimitry Andric 
15640fca6ea1SDimitry Andric   InstructionCost NewCost = TTI.getShuffleCost(
15650fca6ea1SDimitry Andric       TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind);
15660fca6ea1SDimitry Andric   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
15670fca6ea1SDimitry Andric                                   TTI::CastContextHint::None, CostKind);
15680fca6ea1SDimitry Andric   if (!C0->hasOneUse())
15690fca6ea1SDimitry Andric     NewCost += CostC0;
15700fca6ea1SDimitry Andric   if (!C1->hasOneUse())
15710fca6ea1SDimitry Andric     NewCost += CostC1;
15720fca6ea1SDimitry Andric 
15730fca6ea1SDimitry Andric   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
15740fca6ea1SDimitry Andric                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
15750fca6ea1SDimitry Andric                     << "\n");
15760fca6ea1SDimitry Andric   if (NewCost > OldCost)
15770fca6ea1SDimitry Andric     return false;
15780fca6ea1SDimitry Andric 
15790fca6ea1SDimitry Andric   Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
15800fca6ea1SDimitry Andric                                             C1->getOperand(0), NewMask);
15810fca6ea1SDimitry Andric   Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
15820fca6ea1SDimitry Andric 
15830fca6ea1SDimitry Andric   // Intersect flags from the old casts.
15840fca6ea1SDimitry Andric   if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
15850fca6ea1SDimitry Andric     NewInst->copyIRFlags(C0);
15860fca6ea1SDimitry Andric     NewInst->andIRFlags(C1);
15870fca6ea1SDimitry Andric   }
15880fca6ea1SDimitry Andric 
15890fca6ea1SDimitry Andric   Worklist.pushValue(Shuf);
15900fca6ea1SDimitry Andric   replaceValue(I, *Cast);
15910fca6ea1SDimitry Andric   return true;
15920fca6ea1SDimitry Andric }
15930fca6ea1SDimitry Andric 
15940fca6ea1SDimitry Andric /// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
15950fca6ea1SDimitry Andric /// into "shuffle x, y".
15960fca6ea1SDimitry Andric bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
15970fca6ea1SDimitry Andric   Value *V0, *V1;
15980fca6ea1SDimitry Andric   UndefValue *U0, *U1;
15990fca6ea1SDimitry Andric   ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
16000fca6ea1SDimitry Andric   if (!match(&I, m_Shuffle(m_OneUse(m_Shuffle(m_Value(V0), m_UndefValue(U0),
16010fca6ea1SDimitry Andric                                               m_Mask(InnerMask0))),
16020fca6ea1SDimitry Andric                            m_OneUse(m_Shuffle(m_Value(V1), m_UndefValue(U1),
16030fca6ea1SDimitry Andric                                               m_Mask(InnerMask1))),
16040fca6ea1SDimitry Andric                            m_Mask(OuterMask))))
16050fca6ea1SDimitry Andric     return false;
16060fca6ea1SDimitry Andric 
16070fca6ea1SDimitry Andric   auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0));
16080fca6ea1SDimitry Andric   auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1));
16090fca6ea1SDimitry Andric   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
16100fca6ea1SDimitry Andric   auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
16110fca6ea1SDimitry Andric   auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
16120fca6ea1SDimitry Andric   if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
16130fca6ea1SDimitry Andric       V0->getType() != V1->getType())
16140fca6ea1SDimitry Andric     return false;
16150fca6ea1SDimitry Andric 
16160fca6ea1SDimitry Andric   unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
16170fca6ea1SDimitry Andric   unsigned NumImmElts = ShuffleImmTy->getNumElements();
16180fca6ea1SDimitry Andric 
16190fca6ea1SDimitry Andric   // Bail if either inner masks reference a RHS undef arg.
16200fca6ea1SDimitry Andric   if ((!isa<PoisonValue>(U0) &&
16210fca6ea1SDimitry Andric        any_of(InnerMask0, [&](int M) { return M >= (int)NumSrcElts; })) ||
16220fca6ea1SDimitry Andric       (!isa<PoisonValue>(U1) &&
16230fca6ea1SDimitry Andric        any_of(InnerMask1, [&](int M) { return M >= (int)NumSrcElts; })))
16240fca6ea1SDimitry Andric     return false;
16250fca6ea1SDimitry Andric 
16260fca6ea1SDimitry Andric   // Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem,
16270fca6ea1SDimitry Andric   SmallVector<int, 16> NewMask(OuterMask.begin(), OuterMask.end());
16280fca6ea1SDimitry Andric   for (int &M : NewMask) {
16290fca6ea1SDimitry Andric     if (0 <= M && M < (int)NumImmElts) {
16300fca6ea1SDimitry Andric       M = (InnerMask0[M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0[M];
16310fca6ea1SDimitry Andric     } else if (M >= (int)NumImmElts) {
16320fca6ea1SDimitry Andric       if (InnerMask1[M - NumImmElts] >= (int)NumSrcElts)
16330fca6ea1SDimitry Andric         M = PoisonMaskElem;
16340fca6ea1SDimitry Andric       else
16350fca6ea1SDimitry Andric         M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts);
16360fca6ea1SDimitry Andric     }
16370fca6ea1SDimitry Andric   }
16380fca6ea1SDimitry Andric 
16390fca6ea1SDimitry Andric   // Have we folded to an Identity shuffle?
16400fca6ea1SDimitry Andric   if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
16410fca6ea1SDimitry Andric     replaceValue(I, *V0);
16420fca6ea1SDimitry Andric     return true;
16430fca6ea1SDimitry Andric   }
16440fca6ea1SDimitry Andric 
16450fca6ea1SDimitry Andric   // Try to merge the shuffles if the new shuffle is not costly.
16460fca6ea1SDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16470fca6ea1SDimitry Andric 
16480fca6ea1SDimitry Andric   InstructionCost OldCost =
16490fca6ea1SDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
16500fca6ea1SDimitry Andric                          InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0) +
16510fca6ea1SDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
16520fca6ea1SDimitry Andric                          InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1) +
16530fca6ea1SDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy,
16540fca6ea1SDimitry Andric                          OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I);
16550fca6ea1SDimitry Andric 
16560fca6ea1SDimitry Andric   InstructionCost NewCost =
16570fca6ea1SDimitry Andric       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy,
16580fca6ea1SDimitry Andric                          NewMask, CostKind, 0, nullptr, {V0, V1});
16590fca6ea1SDimitry Andric 
16600fca6ea1SDimitry Andric   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
16610fca6ea1SDimitry Andric                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
16620fca6ea1SDimitry Andric                     << "\n");
16630fca6ea1SDimitry Andric   if (NewCost > OldCost)
16640fca6ea1SDimitry Andric     return false;
16650fca6ea1SDimitry Andric 
16660fca6ea1SDimitry Andric   // Clear unused sources to poison.
16670fca6ea1SDimitry Andric   if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; }))
16680fca6ea1SDimitry Andric     V0 = PoisonValue::get(ShuffleSrcTy);
16690fca6ea1SDimitry Andric   if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; }))
16700fca6ea1SDimitry Andric     V1 = PoisonValue::get(ShuffleSrcTy);
16710fca6ea1SDimitry Andric 
16720fca6ea1SDimitry Andric   Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask);
16730fca6ea1SDimitry Andric   replaceValue(I, *Shuf);
16740fca6ea1SDimitry Andric   return true;
16750fca6ea1SDimitry Andric }
16760fca6ea1SDimitry Andric 
16770fca6ea1SDimitry Andric using InstLane = std::pair<Use *, int>;
16780fca6ea1SDimitry Andric 
16790fca6ea1SDimitry Andric static InstLane lookThroughShuffles(Use *U, int Lane) {
16800fca6ea1SDimitry Andric   while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
16810fca6ea1SDimitry Andric     unsigned NumElts =
16820fca6ea1SDimitry Andric         cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
16830fca6ea1SDimitry Andric     int M = SV->getMaskValue(Lane);
16840fca6ea1SDimitry Andric     if (M < 0)
16850fca6ea1SDimitry Andric       return {nullptr, PoisonMaskElem};
16860fca6ea1SDimitry Andric     if (static_cast<unsigned>(M) < NumElts) {
16870fca6ea1SDimitry Andric       U = &SV->getOperandUse(0);
16880fca6ea1SDimitry Andric       Lane = M;
16890fca6ea1SDimitry Andric     } else {
16900fca6ea1SDimitry Andric       U = &SV->getOperandUse(1);
16910fca6ea1SDimitry Andric       Lane = M - NumElts;
16920fca6ea1SDimitry Andric     }
16930fca6ea1SDimitry Andric   }
16940fca6ea1SDimitry Andric   return InstLane{U, Lane};
16950fca6ea1SDimitry Andric }
16960fca6ea1SDimitry Andric 
16970fca6ea1SDimitry Andric static SmallVector<InstLane>
16980fca6ea1SDimitry Andric generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
16990fca6ea1SDimitry Andric   SmallVector<InstLane> NItem;
17000fca6ea1SDimitry Andric   for (InstLane IL : Item) {
17010fca6ea1SDimitry Andric     auto [U, Lane] = IL;
17020fca6ea1SDimitry Andric     InstLane OpLane =
17030fca6ea1SDimitry Andric         U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
17040fca6ea1SDimitry Andric                                 Lane)
17050fca6ea1SDimitry Andric           : InstLane{nullptr, PoisonMaskElem};
17060fca6ea1SDimitry Andric     NItem.emplace_back(OpLane);
17070fca6ea1SDimitry Andric   }
17080fca6ea1SDimitry Andric   return NItem;
17090fca6ea1SDimitry Andric }
17100fca6ea1SDimitry Andric 
17110fca6ea1SDimitry Andric /// Detect concat of multiple values into a vector
17120fca6ea1SDimitry Andric static bool isFreeConcat(ArrayRef<InstLane> Item,
17130fca6ea1SDimitry Andric                          const TargetTransformInfo &TTI) {
17140fca6ea1SDimitry Andric   auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
17150fca6ea1SDimitry Andric   unsigned NumElts = Ty->getNumElements();
17160fca6ea1SDimitry Andric   if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
17170fca6ea1SDimitry Andric     return false;
17180fca6ea1SDimitry Andric 
17190fca6ea1SDimitry Andric   // Check that the concat is free, usually meaning that the type will be split
17200fca6ea1SDimitry Andric   // during legalization.
17210fca6ea1SDimitry Andric   SmallVector<int, 16> ConcatMask(NumElts * 2);
17220fca6ea1SDimitry Andric   std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
17230fca6ea1SDimitry Andric   if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask,
17240fca6ea1SDimitry Andric                          TTI::TCK_RecipThroughput) != 0)
17250fca6ea1SDimitry Andric     return false;
17260fca6ea1SDimitry Andric 
17270fca6ea1SDimitry Andric   unsigned NumSlices = Item.size() / NumElts;
17280fca6ea1SDimitry Andric   // Currently we generate a tree of shuffles for the concats, which limits us
17290fca6ea1SDimitry Andric   // to a power2.
17300fca6ea1SDimitry Andric   if (!isPowerOf2_32(NumSlices))
17310fca6ea1SDimitry Andric     return false;
17320fca6ea1SDimitry Andric   for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
17330fca6ea1SDimitry Andric     Use *SliceV = Item[Slice * NumElts].first;
17340fca6ea1SDimitry Andric     if (!SliceV || SliceV->get()->getType() != Ty)
17350fca6ea1SDimitry Andric       return false;
17360fca6ea1SDimitry Andric     for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
17370fca6ea1SDimitry Andric       auto [V, Lane] = Item[Slice * NumElts + Elt];
17380fca6ea1SDimitry Andric       if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
17390fca6ea1SDimitry Andric         return false;
17400fca6ea1SDimitry Andric     }
17410fca6ea1SDimitry Andric   }
17420fca6ea1SDimitry Andric   return true;
17430fca6ea1SDimitry Andric }
17440fca6ea1SDimitry Andric 
17450fca6ea1SDimitry Andric static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
17460fca6ea1SDimitry Andric                                   const SmallPtrSet<Use *, 4> &IdentityLeafs,
17470fca6ea1SDimitry Andric                                   const SmallPtrSet<Use *, 4> &SplatLeafs,
17480fca6ea1SDimitry Andric                                   const SmallPtrSet<Use *, 4> &ConcatLeafs,
17490fca6ea1SDimitry Andric                                   IRBuilder<> &Builder) {
17500fca6ea1SDimitry Andric   auto [FrontU, FrontLane] = Item.front();
17510fca6ea1SDimitry Andric 
17520fca6ea1SDimitry Andric   if (IdentityLeafs.contains(FrontU)) {
17530fca6ea1SDimitry Andric     return FrontU->get();
17540fca6ea1SDimitry Andric   }
17550fca6ea1SDimitry Andric   if (SplatLeafs.contains(FrontU)) {
17560fca6ea1SDimitry Andric     SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
17570fca6ea1SDimitry Andric     return Builder.CreateShuffleVector(FrontU->get(), Mask);
17580fca6ea1SDimitry Andric   }
17590fca6ea1SDimitry Andric   if (ConcatLeafs.contains(FrontU)) {
17600fca6ea1SDimitry Andric     unsigned NumElts =
17610fca6ea1SDimitry Andric         cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
17620fca6ea1SDimitry Andric     SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
17630fca6ea1SDimitry Andric     for (unsigned S = 0; S < Values.size(); ++S)
17640fca6ea1SDimitry Andric       Values[S] = Item[S * NumElts].first->get();
17650fca6ea1SDimitry Andric 
17660fca6ea1SDimitry Andric     while (Values.size() > 1) {
17670fca6ea1SDimitry Andric       NumElts *= 2;
17680fca6ea1SDimitry Andric       SmallVector<int, 16> Mask(NumElts, 0);
17690fca6ea1SDimitry Andric       std::iota(Mask.begin(), Mask.end(), 0);
17700fca6ea1SDimitry Andric       SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
17710fca6ea1SDimitry Andric       for (unsigned S = 0; S < NewValues.size(); ++S)
17720fca6ea1SDimitry Andric         NewValues[S] =
17730fca6ea1SDimitry Andric             Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
17740fca6ea1SDimitry Andric       Values = NewValues;
17750fca6ea1SDimitry Andric     }
17760fca6ea1SDimitry Andric     return Values[0];
17770fca6ea1SDimitry Andric   }
17780fca6ea1SDimitry Andric 
17790fca6ea1SDimitry Andric   auto *I = cast<Instruction>(FrontU->get());
17800fca6ea1SDimitry Andric   auto *II = dyn_cast<IntrinsicInst>(I);
17810fca6ea1SDimitry Andric   unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
17820fca6ea1SDimitry Andric   SmallVector<Value *> Ops(NumOps);
17830fca6ea1SDimitry Andric   for (unsigned Idx = 0; Idx < NumOps; Idx++) {
17840fca6ea1SDimitry Andric     if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) {
17850fca6ea1SDimitry Andric       Ops[Idx] = II->getOperand(Idx);
17860fca6ea1SDimitry Andric       continue;
17870fca6ea1SDimitry Andric     }
17880fca6ea1SDimitry Andric     Ops[Idx] =
17890fca6ea1SDimitry Andric         generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty,
17900fca6ea1SDimitry Andric                             IdentityLeafs, SplatLeafs, ConcatLeafs, Builder);
17910fca6ea1SDimitry Andric   }
17920fca6ea1SDimitry Andric 
17930fca6ea1SDimitry Andric   SmallVector<Value *, 8> ValueList;
17940fca6ea1SDimitry Andric   for (const auto &Lane : Item)
17950fca6ea1SDimitry Andric     if (Lane.first)
17960fca6ea1SDimitry Andric       ValueList.push_back(Lane.first->get());
17970fca6ea1SDimitry Andric 
17980fca6ea1SDimitry Andric   Type *DstTy =
17990fca6ea1SDimitry Andric       FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
18000fca6ea1SDimitry Andric   if (auto *BI = dyn_cast<BinaryOperator>(I)) {
18010fca6ea1SDimitry Andric     auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
18020fca6ea1SDimitry Andric                                       Ops[0], Ops[1]);
18030fca6ea1SDimitry Andric     propagateIRFlags(Value, ValueList);
18040fca6ea1SDimitry Andric     return Value;
18050fca6ea1SDimitry Andric   }
18060fca6ea1SDimitry Andric   if (auto *CI = dyn_cast<CmpInst>(I)) {
18070fca6ea1SDimitry Andric     auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
18080fca6ea1SDimitry Andric     propagateIRFlags(Value, ValueList);
18090fca6ea1SDimitry Andric     return Value;
18100fca6ea1SDimitry Andric   }
18110fca6ea1SDimitry Andric   if (auto *SI = dyn_cast<SelectInst>(I)) {
18120fca6ea1SDimitry Andric     auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
18130fca6ea1SDimitry Andric     propagateIRFlags(Value, ValueList);
18140fca6ea1SDimitry Andric     return Value;
18150fca6ea1SDimitry Andric   }
18160fca6ea1SDimitry Andric   if (auto *CI = dyn_cast<CastInst>(I)) {
18170fca6ea1SDimitry Andric     auto *Value = Builder.CreateCast((Instruction::CastOps)CI->getOpcode(),
18180fca6ea1SDimitry Andric                                      Ops[0], DstTy);
18190fca6ea1SDimitry Andric     propagateIRFlags(Value, ValueList);
18200fca6ea1SDimitry Andric     return Value;
18210fca6ea1SDimitry Andric   }
18220fca6ea1SDimitry Andric   if (II) {
18230fca6ea1SDimitry Andric     auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
18240fca6ea1SDimitry Andric     propagateIRFlags(Value, ValueList);
18250fca6ea1SDimitry Andric     return Value;
18260fca6ea1SDimitry Andric   }
18270fca6ea1SDimitry Andric   assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
18280fca6ea1SDimitry Andric   auto *Value =
18290fca6ea1SDimitry Andric       Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
18300fca6ea1SDimitry Andric   propagateIRFlags(Value, ValueList);
18310fca6ea1SDimitry Andric   return Value;
18320fca6ea1SDimitry Andric }
18330fca6ea1SDimitry Andric 
18340fca6ea1SDimitry Andric // Starting from a shuffle, look up through operands tracking the shuffled index
18350fca6ea1SDimitry Andric // of each lane. If we can simplify away the shuffles to identities then
18360fca6ea1SDimitry Andric // do so.
18370fca6ea1SDimitry Andric bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
18380fca6ea1SDimitry Andric   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
18390fca6ea1SDimitry Andric   if (!Ty || I.use_empty())
18400fca6ea1SDimitry Andric     return false;
18410fca6ea1SDimitry Andric 
18420fca6ea1SDimitry Andric   SmallVector<InstLane> Start(Ty->getNumElements());
18430fca6ea1SDimitry Andric   for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
18440fca6ea1SDimitry Andric     Start[M] = lookThroughShuffles(&*I.use_begin(), M);
18450fca6ea1SDimitry Andric 
18460fca6ea1SDimitry Andric   SmallVector<SmallVector<InstLane>> Worklist;
18470fca6ea1SDimitry Andric   Worklist.push_back(Start);
18480fca6ea1SDimitry Andric   SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
18490fca6ea1SDimitry Andric   unsigned NumVisited = 0;
18500fca6ea1SDimitry Andric 
18510fca6ea1SDimitry Andric   while (!Worklist.empty()) {
18520fca6ea1SDimitry Andric     if (++NumVisited > MaxInstrsToScan)
18530fca6ea1SDimitry Andric       return false;
18540fca6ea1SDimitry Andric 
18550fca6ea1SDimitry Andric     SmallVector<InstLane> Item = Worklist.pop_back_val();
18560fca6ea1SDimitry Andric     auto [FrontU, FrontLane] = Item.front();
18570fca6ea1SDimitry Andric 
18580fca6ea1SDimitry Andric     // If we found an undef first lane then bail out to keep things simple.
18590fca6ea1SDimitry Andric     if (!FrontU)
18600fca6ea1SDimitry Andric       return false;
18610fca6ea1SDimitry Andric 
18620fca6ea1SDimitry Andric     // Helper to peek through bitcasts to the same value.
18630fca6ea1SDimitry Andric     auto IsEquiv = [&](Value *X, Value *Y) {
18640fca6ea1SDimitry Andric       return X->getType() == Y->getType() &&
18650fca6ea1SDimitry Andric              peekThroughBitcasts(X) == peekThroughBitcasts(Y);
18660fca6ea1SDimitry Andric     };
18670fca6ea1SDimitry Andric 
18680fca6ea1SDimitry Andric     // Look for an identity value.
18690fca6ea1SDimitry Andric     if (FrontLane == 0 &&
18700fca6ea1SDimitry Andric         cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
18710fca6ea1SDimitry Andric             Ty->getNumElements() &&
18720fca6ea1SDimitry Andric         all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
18730fca6ea1SDimitry Andric           Value *FrontV = Item.front().first->get();
18740fca6ea1SDimitry Andric           return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
18750fca6ea1SDimitry Andric                                       E.value().second == (int)E.index());
18760fca6ea1SDimitry Andric         })) {
18770fca6ea1SDimitry Andric       IdentityLeafs.insert(FrontU);
18780fca6ea1SDimitry Andric       continue;
18790fca6ea1SDimitry Andric     }
18800fca6ea1SDimitry Andric     // Look for constants, for the moment only supporting constant splats.
18810fca6ea1SDimitry Andric     if (auto *C = dyn_cast<Constant>(FrontU);
18820fca6ea1SDimitry Andric         C && C->getSplatValue() &&
18830fca6ea1SDimitry Andric         all_of(drop_begin(Item), [Item](InstLane &IL) {
18840fca6ea1SDimitry Andric           Value *FrontV = Item.front().first->get();
18850fca6ea1SDimitry Andric           Use *U = IL.first;
18860fca6ea1SDimitry Andric           return !U || U->get() == FrontV;
18870fca6ea1SDimitry Andric         })) {
18880fca6ea1SDimitry Andric       SplatLeafs.insert(FrontU);
18890fca6ea1SDimitry Andric       continue;
18900fca6ea1SDimitry Andric     }
18910fca6ea1SDimitry Andric     // Look for a splat value.
18920fca6ea1SDimitry Andric     if (all_of(drop_begin(Item), [Item](InstLane &IL) {
18930fca6ea1SDimitry Andric           auto [FrontU, FrontLane] = Item.front();
18940fca6ea1SDimitry Andric           auto [U, Lane] = IL;
18950fca6ea1SDimitry Andric           return !U || (U->get() == FrontU->get() && Lane == FrontLane);
18960fca6ea1SDimitry Andric         })) {
18970fca6ea1SDimitry Andric       SplatLeafs.insert(FrontU);
18980fca6ea1SDimitry Andric       continue;
18990fca6ea1SDimitry Andric     }
19000fca6ea1SDimitry Andric 
19010fca6ea1SDimitry Andric     // We need each element to be the same type of value, and check that each
19020fca6ea1SDimitry Andric     // element has a single use.
1903*5deeebd8SDimitry Andric     auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
19040fca6ea1SDimitry Andric       Value *FrontV = Item.front().first->get();
19050fca6ea1SDimitry Andric       if (!IL.first)
19060fca6ea1SDimitry Andric         return true;
19070fca6ea1SDimitry Andric       Value *V = IL.first->get();
19080fca6ea1SDimitry Andric       if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse())
19090fca6ea1SDimitry Andric         return false;
19100fca6ea1SDimitry Andric       if (V->getValueID() != FrontV->getValueID())
19110fca6ea1SDimitry Andric         return false;
19120fca6ea1SDimitry Andric       if (auto *CI = dyn_cast<CmpInst>(V))
19130fca6ea1SDimitry Andric         if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
19140fca6ea1SDimitry Andric           return false;
19150fca6ea1SDimitry Andric       if (auto *CI = dyn_cast<CastInst>(V))
19160fca6ea1SDimitry Andric         if (CI->getSrcTy() != cast<CastInst>(FrontV)->getSrcTy())
19170fca6ea1SDimitry Andric           return false;
19180fca6ea1SDimitry Andric       if (auto *SI = dyn_cast<SelectInst>(V))
19190fca6ea1SDimitry Andric         if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
19200fca6ea1SDimitry Andric             SI->getOperand(0)->getType() !=
19210fca6ea1SDimitry Andric                 cast<SelectInst>(FrontV)->getOperand(0)->getType())
19220fca6ea1SDimitry Andric           return false;
19230fca6ea1SDimitry Andric       if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
19240fca6ea1SDimitry Andric         return false;
19250fca6ea1SDimitry Andric       auto *II = dyn_cast<IntrinsicInst>(V);
19260fca6ea1SDimitry Andric       return !II || (isa<IntrinsicInst>(FrontV) &&
19270fca6ea1SDimitry Andric                      II->getIntrinsicID() ==
1928*5deeebd8SDimitry Andric                          cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
1929*5deeebd8SDimitry Andric                      !II->hasOperandBundles());
1930*5deeebd8SDimitry Andric     };
1931*5deeebd8SDimitry Andric     if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
19320fca6ea1SDimitry Andric       // Check the operator is one that we support.
19330fca6ea1SDimitry Andric       if (isa<BinaryOperator, CmpInst>(FrontU)) {
19340fca6ea1SDimitry Andric         //  We exclude div/rem in case they hit UB from poison lanes.
19350fca6ea1SDimitry Andric         if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
19360fca6ea1SDimitry Andric             BO && BO->isIntDivRem())
19370fca6ea1SDimitry Andric           return false;
19380fca6ea1SDimitry Andric         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
19390fca6ea1SDimitry Andric         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
19400fca6ea1SDimitry Andric         continue;
19410fca6ea1SDimitry Andric       } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontU)) {
19420fca6ea1SDimitry Andric         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
19430fca6ea1SDimitry Andric         continue;
19440fca6ea1SDimitry Andric       } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
19450fca6ea1SDimitry Andric         // TODO: Handle vector widening/narrowing bitcasts.
19460fca6ea1SDimitry Andric         auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
19470fca6ea1SDimitry Andric         auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
19480fca6ea1SDimitry Andric         if (DstTy && SrcTy &&
19490fca6ea1SDimitry Andric             SrcTy->getNumElements() == DstTy->getNumElements()) {
19500fca6ea1SDimitry Andric           Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
19510fca6ea1SDimitry Andric           continue;
19520fca6ea1SDimitry Andric         }
19530fca6ea1SDimitry Andric       } else if (isa<SelectInst>(FrontU)) {
19540fca6ea1SDimitry Andric         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
19550fca6ea1SDimitry Andric         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
19560fca6ea1SDimitry Andric         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
19570fca6ea1SDimitry Andric         continue;
19580fca6ea1SDimitry Andric       } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
1959*5deeebd8SDimitry Andric                  II && isTriviallyVectorizable(II->getIntrinsicID()) &&
1960*5deeebd8SDimitry Andric                  !II->hasOperandBundles()) {
19610fca6ea1SDimitry Andric         for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
19620fca6ea1SDimitry Andric           if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
19630fca6ea1SDimitry Andric             if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
19640fca6ea1SDimitry Andric                   Value *FrontV = Item.front().first->get();
19650fca6ea1SDimitry Andric                   Use *U = IL.first;
19660fca6ea1SDimitry Andric                   return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
19670fca6ea1SDimitry Andric                                 cast<Instruction>(FrontV)->getOperand(Op));
19680fca6ea1SDimitry Andric                 }))
19690fca6ea1SDimitry Andric               return false;
19700fca6ea1SDimitry Andric             continue;
19710fca6ea1SDimitry Andric           }
19720fca6ea1SDimitry Andric           Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
19730fca6ea1SDimitry Andric         }
19740fca6ea1SDimitry Andric         continue;
19750fca6ea1SDimitry Andric       }
19760fca6ea1SDimitry Andric     }
19770fca6ea1SDimitry Andric 
19780fca6ea1SDimitry Andric     if (isFreeConcat(Item, TTI)) {
19790fca6ea1SDimitry Andric       ConcatLeafs.insert(FrontU);
19800fca6ea1SDimitry Andric       continue;
19810fca6ea1SDimitry Andric     }
19820fca6ea1SDimitry Andric 
19830fca6ea1SDimitry Andric     return false;
19840fca6ea1SDimitry Andric   }
19850fca6ea1SDimitry Andric 
19860fca6ea1SDimitry Andric   if (NumVisited <= 1)
19870fca6ea1SDimitry Andric     return false;
19880fca6ea1SDimitry Andric 
19890fca6ea1SDimitry Andric   // If we got this far, we know the shuffles are superfluous and can be
19900fca6ea1SDimitry Andric   // removed. Scan through again and generate the new tree of instructions.
19910fca6ea1SDimitry Andric   Builder.SetInsertPoint(&I);
19920fca6ea1SDimitry Andric   Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
19930fca6ea1SDimitry Andric                                  ConcatLeafs, Builder);
19940fca6ea1SDimitry Andric   replaceValue(I, *V);
19950fca6ea1SDimitry Andric   return true;
19960fca6ea1SDimitry Andric }
19970fca6ea1SDimitry Andric 
199881ad6265SDimitry Andric /// Given a commutative reduction, the order of the input lanes does not alter
199981ad6265SDimitry Andric /// the results. We can use this to remove certain shuffles feeding the
200081ad6265SDimitry Andric /// reduction, removing the need to shuffle at all.
200181ad6265SDimitry Andric bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
200281ad6265SDimitry Andric   auto *II = dyn_cast<IntrinsicInst>(&I);
200381ad6265SDimitry Andric   if (!II)
200481ad6265SDimitry Andric     return false;
200581ad6265SDimitry Andric   switch (II->getIntrinsicID()) {
200681ad6265SDimitry Andric   case Intrinsic::vector_reduce_add:
200781ad6265SDimitry Andric   case Intrinsic::vector_reduce_mul:
200881ad6265SDimitry Andric   case Intrinsic::vector_reduce_and:
200981ad6265SDimitry Andric   case Intrinsic::vector_reduce_or:
201081ad6265SDimitry Andric   case Intrinsic::vector_reduce_xor:
201181ad6265SDimitry Andric   case Intrinsic::vector_reduce_smin:
201281ad6265SDimitry Andric   case Intrinsic::vector_reduce_smax:
201381ad6265SDimitry Andric   case Intrinsic::vector_reduce_umin:
201481ad6265SDimitry Andric   case Intrinsic::vector_reduce_umax:
201581ad6265SDimitry Andric     break;
201681ad6265SDimitry Andric   default:
201781ad6265SDimitry Andric     return false;
201881ad6265SDimitry Andric   }
201981ad6265SDimitry Andric 
202081ad6265SDimitry Andric   // Find all the inputs when looking through operations that do not alter the
202181ad6265SDimitry Andric   // lane order (binops, for example). Currently we look for a single shuffle,
202281ad6265SDimitry Andric   // and can ignore splat values.
202381ad6265SDimitry Andric   std::queue<Value *> Worklist;
202481ad6265SDimitry Andric   SmallPtrSet<Value *, 4> Visited;
202581ad6265SDimitry Andric   ShuffleVectorInst *Shuffle = nullptr;
202681ad6265SDimitry Andric   if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
202781ad6265SDimitry Andric     Worklist.push(Op);
202881ad6265SDimitry Andric 
202981ad6265SDimitry Andric   while (!Worklist.empty()) {
203081ad6265SDimitry Andric     Value *CV = Worklist.front();
203181ad6265SDimitry Andric     Worklist.pop();
203281ad6265SDimitry Andric     if (Visited.contains(CV))
203381ad6265SDimitry Andric       continue;
203481ad6265SDimitry Andric 
203581ad6265SDimitry Andric     // Splats don't change the order, so can be safely ignored.
203681ad6265SDimitry Andric     if (isSplatValue(CV))
203781ad6265SDimitry Andric       continue;
203881ad6265SDimitry Andric 
203981ad6265SDimitry Andric     Visited.insert(CV);
204081ad6265SDimitry Andric 
204181ad6265SDimitry Andric     if (auto *CI = dyn_cast<Instruction>(CV)) {
204281ad6265SDimitry Andric       if (CI->isBinaryOp()) {
204381ad6265SDimitry Andric         for (auto *Op : CI->operand_values())
204481ad6265SDimitry Andric           Worklist.push(Op);
204581ad6265SDimitry Andric         continue;
204681ad6265SDimitry Andric       } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
204781ad6265SDimitry Andric         if (Shuffle && Shuffle != SV)
204881ad6265SDimitry Andric           return false;
204981ad6265SDimitry Andric         Shuffle = SV;
205081ad6265SDimitry Andric         continue;
205181ad6265SDimitry Andric       }
205281ad6265SDimitry Andric     }
205381ad6265SDimitry Andric 
205481ad6265SDimitry Andric     // Anything else is currently an unknown node.
205581ad6265SDimitry Andric     return false;
205681ad6265SDimitry Andric   }
205781ad6265SDimitry Andric 
205881ad6265SDimitry Andric   if (!Shuffle)
205981ad6265SDimitry Andric     return false;
206081ad6265SDimitry Andric 
206181ad6265SDimitry Andric   // Check all uses of the binary ops and shuffles are also included in the
206281ad6265SDimitry Andric   // lane-invariant operations (Visited should be the list of lanewise
206381ad6265SDimitry Andric   // instructions, including the shuffle that we found).
206481ad6265SDimitry Andric   for (auto *V : Visited)
206581ad6265SDimitry Andric     for (auto *U : V->users())
206681ad6265SDimitry Andric       if (!Visited.contains(U) && U != &I)
206781ad6265SDimitry Andric         return false;
206881ad6265SDimitry Andric 
206981ad6265SDimitry Andric   FixedVectorType *VecType =
207081ad6265SDimitry Andric       dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
207181ad6265SDimitry Andric   if (!VecType)
207281ad6265SDimitry Andric     return false;
207381ad6265SDimitry Andric   FixedVectorType *ShuffleInputType =
207481ad6265SDimitry Andric       dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
207581ad6265SDimitry Andric   if (!ShuffleInputType)
207681ad6265SDimitry Andric     return false;
20775f757f3fSDimitry Andric   unsigned NumInputElts = ShuffleInputType->getNumElements();
207881ad6265SDimitry Andric 
207981ad6265SDimitry Andric   // Find the mask from sorting the lanes into order. This is most likely to
208081ad6265SDimitry Andric   // become a identity or concat mask. Undef elements are pushed to the end.
208181ad6265SDimitry Andric   SmallVector<int> ConcatMask;
208281ad6265SDimitry Andric   Shuffle->getShuffleMask(ConcatMask);
208381ad6265SDimitry Andric   sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
20845f757f3fSDimitry Andric   // In the case of a truncating shuffle it's possible for the mask
20855f757f3fSDimitry Andric   // to have an index greater than the size of the resulting vector.
20865f757f3fSDimitry Andric   // This requires special handling.
20875f757f3fSDimitry Andric   bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
208881ad6265SDimitry Andric   bool UsesSecondVec =
20895f757f3fSDimitry Andric       any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
20905f757f3fSDimitry Andric 
20915f757f3fSDimitry Andric   FixedVectorType *VecTyForCost =
20925f757f3fSDimitry Andric       (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
209381ad6265SDimitry Andric   InstructionCost OldCost = TTI.getShuffleCost(
20945f757f3fSDimitry Andric       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
20955f757f3fSDimitry Andric       VecTyForCost, Shuffle->getShuffleMask());
209681ad6265SDimitry Andric   InstructionCost NewCost = TTI.getShuffleCost(
20975f757f3fSDimitry Andric       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
20985f757f3fSDimitry Andric       VecTyForCost, ConcatMask);
209981ad6265SDimitry Andric 
210081ad6265SDimitry Andric   LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
210181ad6265SDimitry Andric                     << "\n");
210281ad6265SDimitry Andric   LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost
210381ad6265SDimitry Andric                     << "\n");
210481ad6265SDimitry Andric   if (NewCost < OldCost) {
210581ad6265SDimitry Andric     Builder.SetInsertPoint(Shuffle);
210681ad6265SDimitry Andric     Value *NewShuffle = Builder.CreateShuffleVector(
210781ad6265SDimitry Andric         Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
210881ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
210981ad6265SDimitry Andric     replaceValue(*Shuffle, *NewShuffle);
211081ad6265SDimitry Andric   }
211181ad6265SDimitry Andric 
211281ad6265SDimitry Andric   // See if we can re-use foldSelectShuffle, getting it to reduce the size of
211381ad6265SDimitry Andric   // the shuffle into a nicer order, as it can ignore the order of the shuffles.
211481ad6265SDimitry Andric   return foldSelectShuffle(*Shuffle, true);
211581ad6265SDimitry Andric }
211681ad6265SDimitry Andric 
21170fca6ea1SDimitry Andric /// Determine if its more efficient to fold:
21180fca6ea1SDimitry Andric ///   reduce(trunc(x)) -> trunc(reduce(x)).
21190fca6ea1SDimitry Andric ///   reduce(sext(x))  -> sext(reduce(x)).
21200fca6ea1SDimitry Andric ///   reduce(zext(x))  -> zext(reduce(x)).
21210fca6ea1SDimitry Andric bool VectorCombine::foldCastFromReductions(Instruction &I) {
21220fca6ea1SDimitry Andric   auto *II = dyn_cast<IntrinsicInst>(&I);
21230fca6ea1SDimitry Andric   if (!II)
21240fca6ea1SDimitry Andric     return false;
21250fca6ea1SDimitry Andric 
21260fca6ea1SDimitry Andric   bool TruncOnly = false;
21270fca6ea1SDimitry Andric   Intrinsic::ID IID = II->getIntrinsicID();
21280fca6ea1SDimitry Andric   switch (IID) {
21290fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_add:
21300fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_mul:
21310fca6ea1SDimitry Andric     TruncOnly = true;
21320fca6ea1SDimitry Andric     break;
21330fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_and:
21340fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_or:
21350fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_xor:
21360fca6ea1SDimitry Andric     break;
21370fca6ea1SDimitry Andric   default:
21380fca6ea1SDimitry Andric     return false;
21390fca6ea1SDimitry Andric   }
21400fca6ea1SDimitry Andric 
21410fca6ea1SDimitry Andric   unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
21420fca6ea1SDimitry Andric   Value *ReductionSrc = I.getOperand(0);
21430fca6ea1SDimitry Andric 
21440fca6ea1SDimitry Andric   Value *Src;
21450fca6ea1SDimitry Andric   if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
21460fca6ea1SDimitry Andric       (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
21470fca6ea1SDimitry Andric     return false;
21480fca6ea1SDimitry Andric 
21490fca6ea1SDimitry Andric   auto CastOpc =
21500fca6ea1SDimitry Andric       (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
21510fca6ea1SDimitry Andric 
21520fca6ea1SDimitry Andric   auto *SrcTy = cast<VectorType>(Src->getType());
21530fca6ea1SDimitry Andric   auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
21540fca6ea1SDimitry Andric   Type *ResultTy = I.getType();
21550fca6ea1SDimitry Andric 
21560fca6ea1SDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
21570fca6ea1SDimitry Andric   InstructionCost OldCost = TTI.getArithmeticReductionCost(
21580fca6ea1SDimitry Andric       ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
21590fca6ea1SDimitry Andric   OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
21600fca6ea1SDimitry Andric                                   TTI::CastContextHint::None, CostKind,
21610fca6ea1SDimitry Andric                                   cast<CastInst>(ReductionSrc));
21620fca6ea1SDimitry Andric   InstructionCost NewCost =
21630fca6ea1SDimitry Andric       TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
21640fca6ea1SDimitry Andric                                      CostKind) +
21650fca6ea1SDimitry Andric       TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
21660fca6ea1SDimitry Andric                            TTI::CastContextHint::None, CostKind);
21670fca6ea1SDimitry Andric 
21680fca6ea1SDimitry Andric   if (OldCost <= NewCost || !NewCost.isValid())
21690fca6ea1SDimitry Andric     return false;
21700fca6ea1SDimitry Andric 
21710fca6ea1SDimitry Andric   Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
21720fca6ea1SDimitry Andric                                                 II->getIntrinsicID(), {Src});
21730fca6ea1SDimitry Andric   Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
21740fca6ea1SDimitry Andric   replaceValue(I, *NewCast);
21750fca6ea1SDimitry Andric   return true;
21760fca6ea1SDimitry Andric }
21770fca6ea1SDimitry Andric 
217881ad6265SDimitry Andric /// This method looks for groups of shuffles acting on binops, of the form:
217981ad6265SDimitry Andric ///  %x = shuffle ...
218081ad6265SDimitry Andric ///  %y = shuffle ...
218181ad6265SDimitry Andric ///  %a = binop %x, %y
218281ad6265SDimitry Andric ///  %b = binop %x, %y
218381ad6265SDimitry Andric ///  shuffle %a, %b, selectmask
218481ad6265SDimitry Andric /// We may, especially if the shuffle is wider than legal, be able to convert
218581ad6265SDimitry Andric /// the shuffle to a form where only parts of a and b need to be computed. On
218681ad6265SDimitry Andric /// architectures with no obvious "select" shuffle, this can reduce the total
218781ad6265SDimitry Andric /// number of operations if the target reports them as cheaper.
218881ad6265SDimitry Andric bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
2189bdd1243dSDimitry Andric   auto *SVI = cast<ShuffleVectorInst>(&I);
2190bdd1243dSDimitry Andric   auto *VT = cast<FixedVectorType>(I.getType());
219181ad6265SDimitry Andric   auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
219281ad6265SDimitry Andric   auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
219381ad6265SDimitry Andric   if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
219481ad6265SDimitry Andric       VT != Op0->getType())
219581ad6265SDimitry Andric     return false;
2196bdd1243dSDimitry Andric 
2197753f127fSDimitry Andric   auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
2198753f127fSDimitry Andric   auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
2199753f127fSDimitry Andric   auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
2200753f127fSDimitry Andric   auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
2201753f127fSDimitry Andric   SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
220281ad6265SDimitry Andric   auto checkSVNonOpUses = [&](Instruction *I) {
220381ad6265SDimitry Andric     if (!I || I->getOperand(0)->getType() != VT)
220481ad6265SDimitry Andric       return true;
2205753f127fSDimitry Andric     return any_of(I->users(), [&](User *U) {
2206753f127fSDimitry Andric       return U != Op0 && U != Op1 &&
2207753f127fSDimitry Andric              !(isa<ShuffleVectorInst>(U) &&
2208753f127fSDimitry Andric                (InputShuffles.contains(cast<Instruction>(U)) ||
2209753f127fSDimitry Andric                 isInstructionTriviallyDead(cast<Instruction>(U))));
2210753f127fSDimitry Andric     });
221181ad6265SDimitry Andric   };
221281ad6265SDimitry Andric   if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
221381ad6265SDimitry Andric       checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
221481ad6265SDimitry Andric     return false;
221581ad6265SDimitry Andric 
221681ad6265SDimitry Andric   // Collect all the uses that are shuffles that we can transform together. We
221781ad6265SDimitry Andric   // may not have a single shuffle, but a group that can all be transformed
221881ad6265SDimitry Andric   // together profitably.
221981ad6265SDimitry Andric   SmallVector<ShuffleVectorInst *> Shuffles;
222081ad6265SDimitry Andric   auto collectShuffles = [&](Instruction *I) {
222181ad6265SDimitry Andric     for (auto *U : I->users()) {
222281ad6265SDimitry Andric       auto *SV = dyn_cast<ShuffleVectorInst>(U);
222381ad6265SDimitry Andric       if (!SV || SV->getType() != VT)
222481ad6265SDimitry Andric         return false;
2225753f127fSDimitry Andric       if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
2226753f127fSDimitry Andric           (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
2227753f127fSDimitry Andric         return false;
222881ad6265SDimitry Andric       if (!llvm::is_contained(Shuffles, SV))
222981ad6265SDimitry Andric         Shuffles.push_back(SV);
223081ad6265SDimitry Andric     }
223181ad6265SDimitry Andric     return true;
223281ad6265SDimitry Andric   };
223381ad6265SDimitry Andric   if (!collectShuffles(Op0) || !collectShuffles(Op1))
223481ad6265SDimitry Andric     return false;
223581ad6265SDimitry Andric   // From a reduction, we need to be processing a single shuffle, otherwise the
223681ad6265SDimitry Andric   // other uses will not be lane-invariant.
223781ad6265SDimitry Andric   if (FromReduction && Shuffles.size() > 1)
223881ad6265SDimitry Andric     return false;
223981ad6265SDimitry Andric 
2240753f127fSDimitry Andric   // Add any shuffle uses for the shuffles we have found, to include them in our
2241753f127fSDimitry Andric   // cost calculations.
2242753f127fSDimitry Andric   if (!FromReduction) {
2243753f127fSDimitry Andric     for (ShuffleVectorInst *SV : Shuffles) {
2244bdd1243dSDimitry Andric       for (auto *U : SV->users()) {
2245753f127fSDimitry Andric         ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
2246fcaf7f86SDimitry Andric         if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
2247753f127fSDimitry Andric           Shuffles.push_back(SSV);
2248753f127fSDimitry Andric       }
2249753f127fSDimitry Andric     }
2250753f127fSDimitry Andric   }
2251753f127fSDimitry Andric 
225281ad6265SDimitry Andric   // For each of the output shuffles, we try to sort all the first vector
225381ad6265SDimitry Andric   // elements to the beginning, followed by the second array elements at the
225481ad6265SDimitry Andric   // end. If the binops are legalized to smaller vectors, this may reduce total
225581ad6265SDimitry Andric   // number of binops. We compute the ReconstructMask mask needed to convert
225681ad6265SDimitry Andric   // back to the original lane order.
2257753f127fSDimitry Andric   SmallVector<std::pair<int, int>> V1, V2;
2258753f127fSDimitry Andric   SmallVector<SmallVector<int>> OrigReconstructMasks;
225981ad6265SDimitry Andric   int MaxV1Elt = 0, MaxV2Elt = 0;
226081ad6265SDimitry Andric   unsigned NumElts = VT->getNumElements();
226181ad6265SDimitry Andric   for (ShuffleVectorInst *SVN : Shuffles) {
226281ad6265SDimitry Andric     SmallVector<int> Mask;
226381ad6265SDimitry Andric     SVN->getShuffleMask(Mask);
226481ad6265SDimitry Andric 
226581ad6265SDimitry Andric     // Check the operands are the same as the original, or reversed (in which
226681ad6265SDimitry Andric     // case we need to commute the mask).
226781ad6265SDimitry Andric     Value *SVOp0 = SVN->getOperand(0);
226881ad6265SDimitry Andric     Value *SVOp1 = SVN->getOperand(1);
2269753f127fSDimitry Andric     if (isa<UndefValue>(SVOp1)) {
2270753f127fSDimitry Andric       auto *SSV = cast<ShuffleVectorInst>(SVOp0);
2271753f127fSDimitry Andric       SVOp0 = SSV->getOperand(0);
2272753f127fSDimitry Andric       SVOp1 = SSV->getOperand(1);
2273753f127fSDimitry Andric       for (unsigned I = 0, E = Mask.size(); I != E; I++) {
2274753f127fSDimitry Andric         if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size()))
2275753f127fSDimitry Andric           return false;
2276753f127fSDimitry Andric         Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
2277753f127fSDimitry Andric       }
2278753f127fSDimitry Andric     }
227981ad6265SDimitry Andric     if (SVOp0 == Op1 && SVOp1 == Op0) {
228081ad6265SDimitry Andric       std::swap(SVOp0, SVOp1);
228181ad6265SDimitry Andric       ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
228281ad6265SDimitry Andric     }
228381ad6265SDimitry Andric     if (SVOp0 != Op0 || SVOp1 != Op1)
228481ad6265SDimitry Andric       return false;
228581ad6265SDimitry Andric 
228681ad6265SDimitry Andric     // Calculate the reconstruction mask for this shuffle, as the mask needed to
228781ad6265SDimitry Andric     // take the packed values from Op0/Op1 and reconstructing to the original
228881ad6265SDimitry Andric     // order.
228981ad6265SDimitry Andric     SmallVector<int> ReconstructMask;
229081ad6265SDimitry Andric     for (unsigned I = 0; I < Mask.size(); I++) {
229181ad6265SDimitry Andric       if (Mask[I] < 0) {
229281ad6265SDimitry Andric         ReconstructMask.push_back(-1);
229381ad6265SDimitry Andric       } else if (Mask[I] < static_cast<int>(NumElts)) {
229481ad6265SDimitry Andric         MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
2295753f127fSDimitry Andric         auto It = find_if(V1, [&](const std::pair<int, int> &A) {
2296753f127fSDimitry Andric           return Mask[I] == A.first;
2297753f127fSDimitry Andric         });
229881ad6265SDimitry Andric         if (It != V1.end())
229981ad6265SDimitry Andric           ReconstructMask.push_back(It - V1.begin());
230081ad6265SDimitry Andric         else {
230181ad6265SDimitry Andric           ReconstructMask.push_back(V1.size());
2302753f127fSDimitry Andric           V1.emplace_back(Mask[I], V1.size());
230381ad6265SDimitry Andric         }
230481ad6265SDimitry Andric       } else {
230581ad6265SDimitry Andric         MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
2306753f127fSDimitry Andric         auto It = find_if(V2, [&](const std::pair<int, int> &A) {
2307753f127fSDimitry Andric           return Mask[I] - static_cast<int>(NumElts) == A.first;
2308753f127fSDimitry Andric         });
230981ad6265SDimitry Andric         if (It != V2.end())
231081ad6265SDimitry Andric           ReconstructMask.push_back(NumElts + It - V2.begin());
231181ad6265SDimitry Andric         else {
231281ad6265SDimitry Andric           ReconstructMask.push_back(NumElts + V2.size());
2313753f127fSDimitry Andric           V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
231481ad6265SDimitry Andric         }
231581ad6265SDimitry Andric       }
231681ad6265SDimitry Andric     }
231781ad6265SDimitry Andric 
231881ad6265SDimitry Andric     // For reductions, we know that the lane ordering out doesn't alter the
231981ad6265SDimitry Andric     // result. In-order can help simplify the shuffle away.
232081ad6265SDimitry Andric     if (FromReduction)
232181ad6265SDimitry Andric       sort(ReconstructMask);
2322753f127fSDimitry Andric     OrigReconstructMasks.push_back(std::move(ReconstructMask));
232381ad6265SDimitry Andric   }
232481ad6265SDimitry Andric 
232581ad6265SDimitry Andric   // If the Maximum element used from V1 and V2 are not larger than the new
232681ad6265SDimitry Andric   // vectors, the vectors are already packes and performing the optimization
232781ad6265SDimitry Andric   // again will likely not help any further. This also prevents us from getting
232881ad6265SDimitry Andric   // stuck in a cycle in case the costs do not also rule it out.
232981ad6265SDimitry Andric   if (V1.empty() || V2.empty() ||
233081ad6265SDimitry Andric       (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
233181ad6265SDimitry Andric        MaxV2Elt == static_cast<int>(V2.size()) - 1))
233281ad6265SDimitry Andric     return false;
233381ad6265SDimitry Andric 
2334753f127fSDimitry Andric   // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
2335753f127fSDimitry Andric   // shuffle of another shuffle, or not a shuffle (that is treated like a
2336753f127fSDimitry Andric   // identity shuffle).
2337753f127fSDimitry Andric   auto GetBaseMaskValue = [&](Instruction *I, int M) {
2338753f127fSDimitry Andric     auto *SV = dyn_cast<ShuffleVectorInst>(I);
2339753f127fSDimitry Andric     if (!SV)
2340753f127fSDimitry Andric       return M;
2341753f127fSDimitry Andric     if (isa<UndefValue>(SV->getOperand(1)))
2342753f127fSDimitry Andric       if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
2343753f127fSDimitry Andric         if (InputShuffles.contains(SSV))
2344753f127fSDimitry Andric           return SSV->getMaskValue(SV->getMaskValue(M));
2345753f127fSDimitry Andric     return SV->getMaskValue(M);
2346753f127fSDimitry Andric   };
2347753f127fSDimitry Andric 
2348753f127fSDimitry Andric   // Attempt to sort the inputs my ascending mask values to make simpler input
2349753f127fSDimitry Andric   // shuffles and push complex shuffles down to the uses. We sort on the first
2350753f127fSDimitry Andric   // of the two input shuffle orders, to try and get at least one input into a
2351753f127fSDimitry Andric   // nice order.
2352753f127fSDimitry Andric   auto SortBase = [&](Instruction *A, std::pair<int, int> X,
2353753f127fSDimitry Andric                       std::pair<int, int> Y) {
2354753f127fSDimitry Andric     int MXA = GetBaseMaskValue(A, X.first);
2355753f127fSDimitry Andric     int MYA = GetBaseMaskValue(A, Y.first);
2356753f127fSDimitry Andric     return MXA < MYA;
2357753f127fSDimitry Andric   };
2358753f127fSDimitry Andric   stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
2359753f127fSDimitry Andric     return SortBase(SVI0A, A, B);
2360753f127fSDimitry Andric   });
2361753f127fSDimitry Andric   stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
2362753f127fSDimitry Andric     return SortBase(SVI1A, A, B);
2363753f127fSDimitry Andric   });
2364753f127fSDimitry Andric   // Calculate our ReconstructMasks from the OrigReconstructMasks and the
2365753f127fSDimitry Andric   // modified order of the input shuffles.
2366753f127fSDimitry Andric   SmallVector<SmallVector<int>> ReconstructMasks;
236706c3fb27SDimitry Andric   for (const auto &Mask : OrigReconstructMasks) {
2368753f127fSDimitry Andric     SmallVector<int> ReconstructMask;
2369753f127fSDimitry Andric     for (int M : Mask) {
2370753f127fSDimitry Andric       auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
2371753f127fSDimitry Andric         auto It = find_if(V, [M](auto A) { return A.second == M; });
2372753f127fSDimitry Andric         assert(It != V.end() && "Expected all entries in Mask");
2373753f127fSDimitry Andric         return std::distance(V.begin(), It);
2374753f127fSDimitry Andric       };
2375753f127fSDimitry Andric       if (M < 0)
2376753f127fSDimitry Andric         ReconstructMask.push_back(-1);
2377753f127fSDimitry Andric       else if (M < static_cast<int>(NumElts)) {
2378753f127fSDimitry Andric         ReconstructMask.push_back(FindIndex(V1, M));
2379753f127fSDimitry Andric       } else {
2380753f127fSDimitry Andric         ReconstructMask.push_back(NumElts + FindIndex(V2, M));
2381753f127fSDimitry Andric       }
2382753f127fSDimitry Andric     }
2383753f127fSDimitry Andric     ReconstructMasks.push_back(std::move(ReconstructMask));
2384753f127fSDimitry Andric   }
2385753f127fSDimitry Andric 
238681ad6265SDimitry Andric   // Calculate the masks needed for the new input shuffles, which get padded
238781ad6265SDimitry Andric   // with undef
238881ad6265SDimitry Andric   SmallVector<int> V1A, V1B, V2A, V2B;
238981ad6265SDimitry Andric   for (unsigned I = 0; I < V1.size(); I++) {
2390753f127fSDimitry Andric     V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
2391753f127fSDimitry Andric     V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
239281ad6265SDimitry Andric   }
239381ad6265SDimitry Andric   for (unsigned I = 0; I < V2.size(); I++) {
2394753f127fSDimitry Andric     V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
2395753f127fSDimitry Andric     V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
239681ad6265SDimitry Andric   }
239781ad6265SDimitry Andric   while (V1A.size() < NumElts) {
239806c3fb27SDimitry Andric     V1A.push_back(PoisonMaskElem);
239906c3fb27SDimitry Andric     V1B.push_back(PoisonMaskElem);
240081ad6265SDimitry Andric   }
240181ad6265SDimitry Andric   while (V2A.size() < NumElts) {
240206c3fb27SDimitry Andric     V2A.push_back(PoisonMaskElem);
240306c3fb27SDimitry Andric     V2B.push_back(PoisonMaskElem);
240481ad6265SDimitry Andric   }
240581ad6265SDimitry Andric 
2406753f127fSDimitry Andric   auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
2407753f127fSDimitry Andric     auto *SV = dyn_cast<ShuffleVectorInst>(I);
2408753f127fSDimitry Andric     if (!SV)
2409753f127fSDimitry Andric       return C;
2410753f127fSDimitry Andric     return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
2411753f127fSDimitry Andric                                       ? TTI::SK_PermuteSingleSrc
2412753f127fSDimitry Andric                                       : TTI::SK_PermuteTwoSrc,
2413753f127fSDimitry Andric                                   VT, SV->getShuffleMask());
241481ad6265SDimitry Andric   };
241581ad6265SDimitry Andric   auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
241681ad6265SDimitry Andric     return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
241781ad6265SDimitry Andric   };
241881ad6265SDimitry Andric 
241981ad6265SDimitry Andric   // Get the costs of the shuffles + binops before and after with the new
242081ad6265SDimitry Andric   // shuffle masks.
242181ad6265SDimitry Andric   InstructionCost CostBefore =
242281ad6265SDimitry Andric       TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
242381ad6265SDimitry Andric       TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
242481ad6265SDimitry Andric   CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
242581ad6265SDimitry Andric                                 InstructionCost(0), AddShuffleCost);
242681ad6265SDimitry Andric   CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
242781ad6265SDimitry Andric                                 InstructionCost(0), AddShuffleCost);
242881ad6265SDimitry Andric 
242981ad6265SDimitry Andric   // The new binops will be unused for lanes past the used shuffle lengths.
243081ad6265SDimitry Andric   // These types attempt to get the correct cost for that from the target.
243181ad6265SDimitry Andric   FixedVectorType *Op0SmallVT =
243281ad6265SDimitry Andric       FixedVectorType::get(VT->getScalarType(), V1.size());
243381ad6265SDimitry Andric   FixedVectorType *Op1SmallVT =
243481ad6265SDimitry Andric       FixedVectorType::get(VT->getScalarType(), V2.size());
243581ad6265SDimitry Andric   InstructionCost CostAfter =
243681ad6265SDimitry Andric       TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
243781ad6265SDimitry Andric       TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
243881ad6265SDimitry Andric   CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
243981ad6265SDimitry Andric                                InstructionCost(0), AddShuffleMaskCost);
244081ad6265SDimitry Andric   std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
244181ad6265SDimitry Andric   CostAfter +=
244281ad6265SDimitry Andric       std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
244381ad6265SDimitry Andric                       InstructionCost(0), AddShuffleMaskCost);
244481ad6265SDimitry Andric 
2445753f127fSDimitry Andric   LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
2446753f127fSDimitry Andric   LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore
2447753f127fSDimitry Andric                     << " vs CostAfter: " << CostAfter << "\n");
244881ad6265SDimitry Andric   if (CostBefore <= CostAfter)
244981ad6265SDimitry Andric     return false;
245081ad6265SDimitry Andric 
245181ad6265SDimitry Andric   // The cost model has passed, create the new instructions.
2452753f127fSDimitry Andric   auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
2453753f127fSDimitry Andric     auto *SV = dyn_cast<ShuffleVectorInst>(I);
2454753f127fSDimitry Andric     if (!SV)
2455753f127fSDimitry Andric       return I;
2456753f127fSDimitry Andric     if (isa<UndefValue>(SV->getOperand(1)))
2457753f127fSDimitry Andric       if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
2458753f127fSDimitry Andric         if (InputShuffles.contains(SSV))
2459753f127fSDimitry Andric           return SSV->getOperand(Op);
2460753f127fSDimitry Andric     return SV->getOperand(Op);
2461753f127fSDimitry Andric   };
24625f757f3fSDimitry Andric   Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
2463753f127fSDimitry Andric   Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
2464753f127fSDimitry Andric                                              GetShuffleOperand(SVI0A, 1), V1A);
24655f757f3fSDimitry Andric   Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
2466753f127fSDimitry Andric   Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
2467753f127fSDimitry Andric                                              GetShuffleOperand(SVI0B, 1), V1B);
24685f757f3fSDimitry Andric   Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
2469753f127fSDimitry Andric   Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
2470753f127fSDimitry Andric                                              GetShuffleOperand(SVI1A, 1), V2A);
24715f757f3fSDimitry Andric   Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
2472753f127fSDimitry Andric   Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
2473753f127fSDimitry Andric                                              GetShuffleOperand(SVI1B, 1), V2B);
247481ad6265SDimitry Andric   Builder.SetInsertPoint(Op0);
247581ad6265SDimitry Andric   Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
247681ad6265SDimitry Andric                                     NSV0A, NSV0B);
247781ad6265SDimitry Andric   if (auto *I = dyn_cast<Instruction>(NOp0))
247881ad6265SDimitry Andric     I->copyIRFlags(Op0, true);
247981ad6265SDimitry Andric   Builder.SetInsertPoint(Op1);
248081ad6265SDimitry Andric   Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
248181ad6265SDimitry Andric                                     NSV1A, NSV1B);
248281ad6265SDimitry Andric   if (auto *I = dyn_cast<Instruction>(NOp1))
248381ad6265SDimitry Andric     I->copyIRFlags(Op1, true);
248481ad6265SDimitry Andric 
248581ad6265SDimitry Andric   for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
248681ad6265SDimitry Andric     Builder.SetInsertPoint(Shuffles[S]);
248781ad6265SDimitry Andric     Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
248881ad6265SDimitry Andric     replaceValue(*Shuffles[S], *NSV);
248981ad6265SDimitry Andric   }
249081ad6265SDimitry Andric 
249181ad6265SDimitry Andric   Worklist.pushValue(NSV0A);
249281ad6265SDimitry Andric   Worklist.pushValue(NSV0B);
249381ad6265SDimitry Andric   Worklist.pushValue(NSV1A);
249481ad6265SDimitry Andric   Worklist.pushValue(NSV1B);
249581ad6265SDimitry Andric   for (auto *S : Shuffles)
249681ad6265SDimitry Andric     Worklist.add(S);
249781ad6265SDimitry Andric   return true;
249881ad6265SDimitry Andric }
249981ad6265SDimitry Andric 
25005ffd83dbSDimitry Andric /// This is the entry point for all transforms. Pass manager differences are
25015ffd83dbSDimitry Andric /// handled in the callers of this function.
25025ffd83dbSDimitry Andric bool VectorCombine::run() {
25035ffd83dbSDimitry Andric   if (DisableVectorCombine)
25045ffd83dbSDimitry Andric     return false;
25055ffd83dbSDimitry Andric 
2506e8d8bef9SDimitry Andric   // Don't attempt vectorization if the target does not support vectors.
2507e8d8bef9SDimitry Andric   if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
2508e8d8bef9SDimitry Andric     return false;
2509e8d8bef9SDimitry Andric 
25105ffd83dbSDimitry Andric   bool MadeChange = false;
2511349cc55cSDimitry Andric   auto FoldInst = [this, &MadeChange](Instruction &I) {
2512349cc55cSDimitry Andric     Builder.SetInsertPoint(&I);
2513bdd1243dSDimitry Andric     bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
2514bdd1243dSDimitry Andric     auto Opcode = I.getOpcode();
2515bdd1243dSDimitry Andric 
2516bdd1243dSDimitry Andric     // These folds should be beneficial regardless of when this pass is run
2517bdd1243dSDimitry Andric     // in the optimization pipeline.
2518bdd1243dSDimitry Andric     // The type checking is for run-time efficiency. We can avoid wasting time
2519bdd1243dSDimitry Andric     // dispatching to folding functions if there's no chance of matching.
2520bdd1243dSDimitry Andric     if (IsFixedVectorType) {
2521bdd1243dSDimitry Andric       switch (Opcode) {
2522bdd1243dSDimitry Andric       case Instruction::InsertElement:
2523349cc55cSDimitry Andric         MadeChange |= vectorizeLoadInsert(I);
2524bdd1243dSDimitry Andric         break;
2525bdd1243dSDimitry Andric       case Instruction::ShuffleVector:
2526bdd1243dSDimitry Andric         MadeChange |= widenSubvectorLoad(I);
2527bdd1243dSDimitry Andric         break;
2528bdd1243dSDimitry Andric       default:
2529bdd1243dSDimitry Andric         break;
2530bdd1243dSDimitry Andric       }
2531bdd1243dSDimitry Andric     }
2532bdd1243dSDimitry Andric 
2533bdd1243dSDimitry Andric     // This transform works with scalable and fixed vectors
2534bdd1243dSDimitry Andric     // TODO: Identify and allow other scalable transforms
25355f757f3fSDimitry Andric     if (isa<VectorType>(I.getType())) {
2536bdd1243dSDimitry Andric       MadeChange |= scalarizeBinopOrCmp(I);
25375f757f3fSDimitry Andric       MadeChange |= scalarizeLoadExtract(I);
25385f757f3fSDimitry Andric       MadeChange |= scalarizeVPIntrinsic(I);
25395f757f3fSDimitry Andric     }
2540bdd1243dSDimitry Andric 
2541bdd1243dSDimitry Andric     if (Opcode == Instruction::Store)
2542349cc55cSDimitry Andric       MadeChange |= foldSingleElementStore(I);
2543bdd1243dSDimitry Andric 
2544bdd1243dSDimitry Andric     // If this is an early pipeline invocation of this pass, we are done.
2545bdd1243dSDimitry Andric     if (TryEarlyFoldsOnly)
2546bdd1243dSDimitry Andric       return;
2547bdd1243dSDimitry Andric 
2548bdd1243dSDimitry Andric     // Otherwise, try folds that improve codegen but may interfere with
2549bdd1243dSDimitry Andric     // early IR canonicalizations.
2550bdd1243dSDimitry Andric     // The type checking is for run-time efficiency. We can avoid wasting time
2551bdd1243dSDimitry Andric     // dispatching to folding functions if there's no chance of matching.
2552bdd1243dSDimitry Andric     if (IsFixedVectorType) {
2553bdd1243dSDimitry Andric       switch (Opcode) {
2554bdd1243dSDimitry Andric       case Instruction::InsertElement:
2555bdd1243dSDimitry Andric         MadeChange |= foldInsExtFNeg(I);
2556bdd1243dSDimitry Andric         break;
2557bdd1243dSDimitry Andric       case Instruction::ShuffleVector:
2558bdd1243dSDimitry Andric         MadeChange |= foldShuffleOfBinops(I);
25590fca6ea1SDimitry Andric         MadeChange |= foldShuffleOfCastops(I);
25600fca6ea1SDimitry Andric         MadeChange |= foldShuffleOfShuffles(I);
2561bdd1243dSDimitry Andric         MadeChange |= foldSelectShuffle(I);
25620fca6ea1SDimitry Andric         MadeChange |= foldShuffleToIdentity(I);
2563bdd1243dSDimitry Andric         break;
2564bdd1243dSDimitry Andric       case Instruction::BitCast:
25655f757f3fSDimitry Andric         MadeChange |= foldBitcastShuffle(I);
2566bdd1243dSDimitry Andric         break;
2567bdd1243dSDimitry Andric       }
2568bdd1243dSDimitry Andric     } else {
2569bdd1243dSDimitry Andric       switch (Opcode) {
2570bdd1243dSDimitry Andric       case Instruction::Call:
2571bdd1243dSDimitry Andric         MadeChange |= foldShuffleFromReductions(I);
25720fca6ea1SDimitry Andric         MadeChange |= foldCastFromReductions(I);
2573bdd1243dSDimitry Andric         break;
2574bdd1243dSDimitry Andric       case Instruction::ICmp:
2575bdd1243dSDimitry Andric       case Instruction::FCmp:
2576bdd1243dSDimitry Andric         MadeChange |= foldExtractExtract(I);
2577bdd1243dSDimitry Andric         break;
2578bdd1243dSDimitry Andric       default:
2579bdd1243dSDimitry Andric         if (Instruction::isBinaryOp(Opcode)) {
2580bdd1243dSDimitry Andric           MadeChange |= foldExtractExtract(I);
2581bdd1243dSDimitry Andric           MadeChange |= foldExtractedCmps(I);
2582bdd1243dSDimitry Andric         }
2583bdd1243dSDimitry Andric         break;
2584bdd1243dSDimitry Andric       }
2585bdd1243dSDimitry Andric     }
2586349cc55cSDimitry Andric   };
2587bdd1243dSDimitry Andric 
25885ffd83dbSDimitry Andric   for (BasicBlock &BB : F) {
25895ffd83dbSDimitry Andric     // Ignore unreachable basic blocks.
25905ffd83dbSDimitry Andric     if (!DT.isReachableFromEntry(&BB))
25915ffd83dbSDimitry Andric       continue;
2592fe6060f1SDimitry Andric     // Use early increment range so that we can erase instructions in loop.
2593fe6060f1SDimitry Andric     for (Instruction &I : make_early_inc_range(BB)) {
2594349cc55cSDimitry Andric       if (I.isDebugOrPseudoInst())
25955ffd83dbSDimitry Andric         continue;
2596349cc55cSDimitry Andric       FoldInst(I);
25975ffd83dbSDimitry Andric     }
25985ffd83dbSDimitry Andric   }
25995ffd83dbSDimitry Andric 
2600349cc55cSDimitry Andric   while (!Worklist.isEmpty()) {
2601349cc55cSDimitry Andric     Instruction *I = Worklist.removeOne();
2602349cc55cSDimitry Andric     if (!I)
2603349cc55cSDimitry Andric       continue;
2604349cc55cSDimitry Andric 
2605349cc55cSDimitry Andric     if (isInstructionTriviallyDead(I)) {
2606349cc55cSDimitry Andric       eraseInstruction(*I);
2607349cc55cSDimitry Andric       continue;
2608349cc55cSDimitry Andric     }
2609349cc55cSDimitry Andric 
2610349cc55cSDimitry Andric     FoldInst(*I);
2611349cc55cSDimitry Andric   }
26125ffd83dbSDimitry Andric 
26135ffd83dbSDimitry Andric   return MadeChange;
26145ffd83dbSDimitry Andric }
26155ffd83dbSDimitry Andric 
26165ffd83dbSDimitry Andric PreservedAnalyses VectorCombinePass::run(Function &F,
26175ffd83dbSDimitry Andric                                          FunctionAnalysisManager &FAM) {
2618fe6060f1SDimitry Andric   auto &AC = FAM.getResult<AssumptionAnalysis>(F);
26195ffd83dbSDimitry Andric   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
26205ffd83dbSDimitry Andric   DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
2621fe6060f1SDimitry Andric   AAResults &AA = FAM.getResult<AAManager>(F);
26220fca6ea1SDimitry Andric   const DataLayout *DL = &F.getDataLayout();
26230fca6ea1SDimitry Andric   VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly);
26245ffd83dbSDimitry Andric   if (!Combiner.run())
26255ffd83dbSDimitry Andric     return PreservedAnalyses::all();
26265ffd83dbSDimitry Andric   PreservedAnalyses PA;
26275ffd83dbSDimitry Andric   PA.preserveSet<CFGAnalyses>();
26285ffd83dbSDimitry Andric   return PA;
26295ffd83dbSDimitry Andric }
2630