//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <queue>

#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"

using namespace llvm;
using namespace llvm::PatternMatch;

STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));
static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();

namespace {
class VectorCombine {
public:
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
                const DataLayout *DL, bool TryEarlyFoldsOnly)
      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
        DL(DL), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}

  bool run();

private:
  Function &F;
  IRBuilder<> Builder;
  const TargetTransformInfo &TTI;
  const DominatorTree &DT;
  AAResults &AA;
  AssumptionCache &AC;
  const DataLayout *DL;

  /// If true, only perform beneficial early IR transforms. Do not introduce
  /// new vector operations.
  bool TryEarlyFoldsOnly;

  InstructionWorklist Worklist;

  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
  // parameter. That should be updated to specific sub-classes because the
  // run loop was changed to dispatch on opcode.
  bool vectorizeLoadInsert(Instruction &I);
  bool widenSubvectorLoad(Instruction &I);
  ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                        ExtractElementInst *Ext1,
                                        unsigned PreferredExtractIndex) const;
  bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                             const Instruction &I,
                             ExtractElementInst *&ConvertToShuffle,
                             unsigned PreferredExtractIndex);
  void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                     Instruction &I);
  void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                       Instruction &I);
  bool foldExtractExtract(Instruction &I);
  bool foldInsExtFNeg(Instruction &I);
  bool foldBitcastShuffle(Instruction &I);
  bool scalarizeBinopOrCmp(Instruction &I);
  bool scalarizeVPIntrinsic(Instruction &I);
  bool foldExtractedCmps(Instruction &I);
  bool foldSingleElementStore(Instruction &I);
  bool scalarizeLoadExtract(Instruction &I);
  bool foldShuffleOfBinops(Instruction &I);
  bool foldShuffleOfCastops(Instruction &I);
  bool foldShuffleOfShuffles(Instruction &I);
  bool foldShuffleToIdentity(Instruction &I);
  bool foldShuffleFromReductions(Instruction &I);
  bool foldCastFromReductions(Instruction &I);
  bool foldSelectShuffle(Instruction &I, bool FromReduction = false);

  void replaceValue(Value &Old, Value &New) {
    Old.replaceAllUsesWith(&New);
    if (auto *NewI = dyn_cast<Instruction>(&New)) {
      New.takeName(&Old);
      Worklist.pushUsersToWorkList(*NewI);
      Worklist.pushValue(NewI);
    }
    Worklist.pushValue(&Old);
  }

  void eraseInstruction(Instruction &I) {
    for (Value *Op : I.operands())
      Worklist.pushValue(Op);
    Worklist.remove(&I);
    I.eraseFromParent();
  }
};
} // namespace

/// Return the source operand of a potentially bitcasted value. If there is no
/// bitcast, return the input value itself.
static Value *peekThroughBitcasts(Value *V) {
  while (auto *BitCast = dyn_cast<BitCastInst>(V))
    V = BitCast->getOperand(0);
  return V;
}

static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
  // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
  // The widened load may load data from dirty regions or create data races
  // non-existent in the source.
  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;

  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
  // sure we have all of our type-based constraints in place for this target.
  Type *ScalarTy = Load->getType()->getScalarType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
      ScalarSize % 8 != 0)
    return false;

  return true;
}

bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
  // Match insert into fixed vector of scalar value.
  // TODO: Handle non-zero insert index.
  Value *Scalar;
  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
      !Scalar->hasOneUse())
    return false;

  // Optionally match an extract from another vector.
  Value *X;
  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
  if (!HasExtract)
    X = Scalar;

  auto *Load = dyn_cast<LoadInst>(X);
  if (!canWidenLoad(Load, TTI))
    return false;

  Type *ScalarTy = Scalar->getType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
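  // The rewrite being attempted, sketched on hypothetical types (the legality
  // and profitability checks below must all pass first):
  //   %s = load float, ptr %p
  //   %i = insertelement <4 x float> poison, float %s, i64 0
  // -->
  //   %i = load <4 x float>, ptr %p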
  // Check safety of replacing the scalar load with a larger vector load.
  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");

  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
  unsigned OffsetEltIndex = 0;
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                   &DT)) {
    // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it is safe to load from a base address
    // with updated alignment. If it is, we can shuffle the element(s) into
    // place after loading.
    unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);

    // We want to shuffle the result down from a high element of a vector, so
    // the offset must be positive.
    if (Offset.isNegative())
      return false;

    // The offset must be a multiple of the scalar element to shuffle cleanly
    // in the element's size.
    uint64_t ScalarSizeInBytes = ScalarSize / 8;
    if (Offset.urem(ScalarSizeInBytes) != 0)
      return false;

    // If we load MinVecNumElts, will our target element still be loaded?
    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
    if (OffsetEltIndex >= MinVecNumElts)
      return false;

    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                     &DT))
      return false;

    // Update alignment with offset value. Note that the offset could be negated
    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
    // negation does not change the result of the alignment calculation.
    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
  }
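  // For example (hypothetical), if the scalar address is one element past a
  // safely loadable base, OffsetEltIndex is 1 and the widened load is later
  // shuffled with mask <1, poison, ...> to move the target value into lane 0.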
  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  OldCost +=
      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
                                   /* Insert */ true, HasExtract, CostKind);

  // New pattern: load VecPtr
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
  // Optionally, we are shuffling the loaded vector element(s) into place.
  // For the mask, set everything but element 0 to poison to prevent poison
  // from propagating from the extra loaded memory. This will also optionally
  // shrink/grow the vector from the loaded size to the output size.
  // We assume this operation has no cost in codegen if there was no offset.
  // Note that we could use freeze to avoid poison problems, but then we might
  // still need a shuffle to change the vector size.
  auto *Ty = cast<FixedVectorType>(I.getType());
  unsigned OutputNumElts = Ty->getNumElements();
  SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
  Mask[0] = OffsetEltIndex;
  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // It is safe and potentially profitable to load a vector directly:
  // inselt undef, load Scalar, 0 --> load VecPtr
  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
  VecLd = Builder.CreateShuffleVector(VecLd, Mask);

  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
bool VectorCombine::widenSubvectorLoad(Instruction &I) {
  // Match subvector insert of fixed vector.
  auto *Shuf = cast<ShuffleVectorInst>(&I);
  if (!Shuf->isIdentityWithPadding())
    return false;

  // Allow a non-canonical shuffle mask that is choosing elements from op1.
  // (any_of returns true iff some mask element chooses op1, making OpIndex 1.)
  unsigned NumOpElts =
      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
    return M >= (int)(NumOpElts);
  });

  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
  if (!canWidenLoad(Load, TTI))
    return false;

  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  auto *Ty = cast<FixedVectorType>(I.getType());
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
    return false;

  Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();

  // Original pattern: insert_subvector (load PtrOp)
  // This conservatively assumes that the cost of a subvector insert into an
  // undef value is 0. We could add that cost if the cost model accurately
  // reflects the real cost of that operation.
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);

  // New pattern: load PtrOp
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
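  // Sketch of the rewrite (hypothetical types): a load of <2 x float> that
  // only feeds an identity-with-padding shuffle to <4 x float> becomes a
  // direct <4 x float> load, since the padded lanes are undefined anyway.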
  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const {
  auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
  auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
  assert(Index0C && Index1C && "Expected constant extract indexes");

  unsigned Index0 = Index0C->getZExtValue();
  unsigned Index1 = Index1C->getZExtValue();

  // If the extract indexes are identical, no shuffle is needed.
  if (Index0 == Index1)
    return nullptr;

  Type *VecTy = Ext0->getVectorOperand()->getType();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
  InstructionCost Cost0 =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  InstructionCost Cost1 =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);

  // If both costs are invalid, no shuffle is needed.
  if (!Cost0.isValid() && !Cost1.isValid())
    return nullptr;

  // We are extracting from 2 different indexes, so one operand must be shuffled
  // before performing a vector operation and/or extract. The more expensive
  // extract will be replaced by a shuffle.
  if (Cost0 > Cost1)
    return Ext0;
  if (Cost1 > Cost0)
    return Ext1;

  // If the costs are equal and there is a preferred extract index, shuffle the
  // opposite operand.
  if (PreferredExtractIndex == Index0)
    return Ext1;
  if (PreferredExtractIndex == Index1)
    return Ext0;

  // Otherwise, replace the extract with the higher index.
  return Index0 > Index1 ? Ext0 : Ext1;
}
/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          const Instruction &I,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) {
  auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
  auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
  assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");

  unsigned Opcode = I.getOpcode();
  Type *ScalarTy = Ext0->getType();
  auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
  InstructionCost ScalarOpCost, VectorOpCost;

  // Get cost estimates for scalar and vector versions of the operation.
  bool IsBinOp = Instruction::isBinaryOp(Opcode);
  if (IsBinOp) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  } else {
    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
           "Expected a compare");
    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  }

  // Get cost estimates for the extract elements. These costs will factor into
  // both sequences.
  unsigned Ext0Index = Ext0IndexC->getZExtValue();
  unsigned Ext1Index = Ext1IndexC->getZExtValue();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  InstructionCost Extract0Cost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
  InstructionCost Extract1Cost =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);

  // A more expensive extract will always be replaced by a splat shuffle.
  // For example, if Ext0 is more expensive:
  // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
  // extelt (opcode (splat V0, Ext0), V1), Ext1
  // TODO: Evaluate whether that always results in lowest cost. Alternatively,
  // check the cost of creating a broadcast shuffle and shuffling both
  // operands to element 0.
  InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

  // Extra uses of the extracts mean that we include those costs in the
  // vector total because those instructions will not be eliminated.
  InstructionCost OldCost, NewCost;
  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
    // Handle a special case. If the 2 extracts are identical, adjust the
    // formulas to account for that. The extra use charge allows for either the
    // CSE'd pattern or an unoptimized form with identical values:
    // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
    bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
                                  : !Ext0->hasOneUse() || !Ext1->hasOneUse();
    OldCost = CheapExtractCost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
  } else {
    // Handle the general case. Each extract is actually a different value:
    // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
    OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost +
              !Ext0->hasOneUse() * Extract0Cost +
              !Ext1->hasOneUse() * Extract1Cost;
  }

  ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
  if (ConvertToShuffle) {
    if (IsBinOp && DisableBinopExtractShuffle)
      return true;

    // If we are extracting from 2 different indexes, then one operand must be
    // shuffled before performing the vector operation. The shuffle mask is
    // poison except for 1 lane that is being translated to the remaining
    // extraction lane. Therefore, it is a splat shuffle. Ex:
    // ShufMask = { poison, poison, 0, poison }
    // TODO: The cost model has an option for a "broadcast" shuffle
    // (splat-from-element-0), but no option for a more general splat.
    NewCost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  }

  // Aggressively form a vector op if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  return OldCost < NewCost;
}

/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilder<> &Builder) {
  // The shuffle mask is poison except for 1 lane that is being translated
  // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
  // ShufMask = { 2, poison, poison, poison }
  auto *VecTy = cast<FixedVectorType>(Vec->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
  ShufMask[NewIndex] = OldIndex;
  return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not creating
/// unnecessary instructions.
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
                                            unsigned NewIndex,
                                            IRBuilder<> &Builder) {
  // Shufflevectors can only be created for fixed-width vectors.
  if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType()))
    return nullptr;

  // If the extract can be constant-folded, this code is unsimplified. Defer
  // to other passes to handle that.
  Value *X = ExtElt->getVectorOperand();
  Value *C = ExtElt->getIndexOperand();
  assert(isa<ConstantInt>(C) && "Expected a constant index operand");
  if (isa<Constant>(X))
    return nullptr;

  Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
                                   NewIndex, Builder);
  return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
}

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
                                  ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<CmpInst>(&I) && "Expected a compare");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
  ++NumVecCmp;
  CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
  Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
                                    ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
  ++NumVecBO;
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecBO =
      Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);

  // All IR flags are safe to back-propagate because any potential poison
  // created in unused vector elements is discarded by the extract.
  if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
    VecBOInst->copyIRFlags(&I);

  Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Match an instruction with extracted vector operands.
bool VectorCombine::foldExtractExtract(Instruction &I) {
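  // The shape being matched, on hypothetical IR:
  //   %a = extractelement <4 x i32> %x, i32 0
  //   %b = extractelement <4 x i32> %y, i32 1
  //   %r = add i32 %a, %b
  // If profitable, %y is shuffled so both extracts use the same index, and the
  // add is performed in a vector lane: extractelement (add %x, %y'), i32 0.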
  // It is not safe to transform things like div, urem, etc. because we may
  // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;

  Instruction *I0, *I1;
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;

  Value *V0, *V1;
  uint64_t C0, C1;
  if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
      !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
      V0->getType() != V1->getType())
    return false;

  // If the scalar value 'I' is going to be re-inserted into a vector, then try
  // to create an extract to that same element. The extract/insert can be
  // reduced to a "select shuffle".
  // TODO: If we add a larger pattern match that starts from an insert, this
  // probably becomes unnecessary.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  uint64_t InsertIndex = InvalidIndex;
  if (I.hasOneUse())
    match(I.user_back(),
          m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));

  ExtractElementInst *ExtractToChange;
  if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
    return false;

  if (ExtractToChange) {
    unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
    ExtractElementInst *NewExtract =
        translateExtract(ExtractToChange, CheapExtractIdx, Builder);
    if (!NewExtract)
      return false;
    if (ExtractToChange == Ext0)
      Ext0 = NewExtract;
    else
      Ext1 = NewExtract;
  }

  if (Pred != CmpInst::BAD_ICMP_PREDICATE)
    foldExtExtCmp(Ext0, Ext1, I);
  else
    foldExtExtBinop(Ext0, Ext1, I);

  Worklist.push(Ext0);
  Worklist.push(Ext1);
  return true;
}

/// Try to replace an extract + scalar fneg + insert with a vector fneg +
/// shuffle.
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
  // Match an insert (op (extract)) pattern.
  Value *DestVec;
  uint64_t Index;
  Instruction *FNeg;
  if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
                             m_ConstantInt(Index))))
    return false;

  // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
  Value *SrcVec;
  Instruction *Extract;
  if (!match(FNeg, m_FNeg(m_CombineAnd(
                       m_Instruction(Extract),
                       m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
    return false;

  // TODO: We could handle this with a length-changing shuffle.
  auto *VecTy = cast<FixedVectorType>(I.getType());
  if (SrcVec->getType() != VecTy)
    return false;

  // Ignore bogus insert/extract index.
  unsigned NumElts = VecTy->getNumElements();
  if (Index >= NumElts)
    return false;

  // We are inserting the negated element into the same lane that we extracted
  // from. This is equivalent to a select-shuffle that chooses all but the
  // negated element from the destination vector.
  SmallVector<int> Mask(NumElts);
  std::iota(Mask.begin(), Mask.end(), 0);
  Mask[Index] = Index + NumElts;

  Type *ScalarTy = VecTy->getScalarType();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
      TTI.getVectorInstrCost(I, VecTy, CostKind, Index);

  // If the extract has one use, it will be eliminated, so count it in the
  // original cost. If it has more than one use, ignore the cost because it will
  // be the same before/after.
  if (Extract->hasOneUse())
    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);

  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);

  if (NewCost > OldCost)
    return false;

  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
  // shuffle DestVec, (fneg SrcVec), Mask
  Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
  Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
  replaceValue(I, *Shuf);
  return true;
}

/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
bool VectorCombine::foldBitcastShuffle(Instruction &I) {
  Value *V0, *V1;
  ArrayRef<int> Mask;
  if (!match(&I, m_BitCast(m_OneUse(
                     m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
    return false;

  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
  //    a scalable type is unknown; second, we cannot reason about whether the
  //    narrowed shuffle mask for a scalable type is a splat or not.
  // 2) Disallow non-vector casts.
  // TODO: We could allow any shuffle.
  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
  auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
  if (!DestTy || !SrcTy)
    return false;

  unsigned DestEltSize = DestTy->getScalarSizeInBits();
  unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
  if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
    return false;

  bool IsUnary = isa<UndefValue>(V1);
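  // Illustrative unary case (hypothetical):
  //   bitcast (shuffle <4 x i32> %x, poison, <3, 2, 1, 0>) to <8 x i16>
  // becomes
  //   shuffle (bitcast %x to <8 x i16>), poison, <6, 7, 4, 5, 2, 3, 0, 1>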
  // For binary shuffles, only fold bitcast(shuffle(X,Y))
  // if it won't increase the number of bitcasts.
  if (!IsUnary) {
    auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
    auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
    if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
        !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
      return false;
  }

  SmallVector<int, 16> NewMask;
  if (DestEltSize <= SrcEltSize) {
    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
    // always be expanded to the equivalent form choosing narrower elements.
    assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = SrcEltSize / DestEltSize;
    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
  } else {
    // The bitcast is from narrow elements to wide elements. The shuffle mask
    // must choose consecutive elements to allow casting first.
    assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = DestEltSize / SrcEltSize;
    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
      return false;
  }

  // Bitcast the shuffle src - keep its original width but using the destination
  // scalar type.
  unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
  auto *NewShuffleTy =
      FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
  auto *OldShuffleTy =
      FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
  unsigned NumOps = IsUnary ? 1 : 2;

  // The new shuffle must not cost more than the old shuffle.
  TargetTransformInfo::TargetCostKind CK =
      TargetTransformInfo::TCK_RecipThroughput;
  TargetTransformInfo::ShuffleKind SK =
      IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
              : TargetTransformInfo::SK_PermuteTwoSrc;
  InstructionCost DestCost =
      TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
      (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
                                     TargetTransformInfo::CastContextHint::None,
                                     CK));
  InstructionCost SrcCost =
      TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
      TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
                           TargetTransformInfo::CastContextHint::None, CK);
  if (DestCost > SrcCost || !DestCost.isValid())
    return false;

  // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
  ++NumShufOfBitcast;
  Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
  Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
  Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
  replaceValue(I, *Shuf);
  return true;
}

/// VP Intrinsics whose vector operands are both splat values may be simplified
/// into the scalar version of the operation and the result splatted. This
/// can lead to scalarization down the line.
bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
  if (!isa<VPIntrinsic>(I))
    return false;
  VPIntrinsic &VPI = cast<VPIntrinsic>(I);
  Value *Op0 = VPI.getArgOperand(0);
  Value *Op1 = VPI.getArgOperand(1);

  if (!isSplatValue(Op0) || !isSplatValue(Op1))
    return false;

  // Check getSplatValue early in this function, to avoid doing unnecessary
  // work.
  Value *ScalarOp0 = getSplatValue(Op0);
  Value *ScalarOp1 = getSplatValue(Op1);
  if (!ScalarOp0 || !ScalarOp1)
    return false;
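  // Sketch of the rewrite (hypothetical values):
  //   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %splat_x,
  //                                          <4 x i32> %splat_y,
  //                                          <4 x i1> %all_true, i32 %evl)
  // --> %s = add i32 %x, %y, then splat %s to <4 x i32>.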
  // For the binary VP intrinsics supported here, the result on disabled lanes
  // is a poison value. For now, only do this simplification if all lanes
  // are active.
  // TODO: Relax the condition that all lanes are active by using insertelement
  // on inactive lanes.
  auto IsAllTrueMask = [](Value *MaskVal) {
    if (Value *SplattedVal = getSplatValue(MaskVal))
      if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
        return ConstValue->isAllOnesValue();
    return false;
  };
  if (!IsAllTrueMask(VPI.getArgOperand(2)))
    return false;

  // Check to make sure we support scalarization of the intrinsic.
  Intrinsic::ID IntrID = VPI.getIntrinsicID();
  if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
    return false;

  // Calculate the cost of splatting both operands into vectors and of the
  // vector intrinsic itself.
  VectorType *VecTy = cast<VectorType>(VPI.getType());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
    Mask.resize(FVTy->getNumElements(), 0);
  InstructionCost SplatCost =
      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask);

  // Calculate the cost of the VP intrinsic.
  SmallVector<Type *, 4> Args;
  for (Value *V : VPI.args())
    Args.push_back(V->getType());
  IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
  InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  InstructionCost OldCost = 2 * SplatCost + VectorOpCost;

  // Determine the scalar opcode.
  std::optional<unsigned> FunctionalOpcode = VPI.getFunctionalOpcode();
  std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
  if (!FunctionalOpcode) {
    ScalarIntrID = VPI.getFunctionalIntrinsicID();
    if (!ScalarIntrID)
      return false;
  }

  // Calculate the cost of scalarizing.
  InstructionCost ScalarOpCost = 0;
  if (ScalarIntrID) {
    IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
    ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  } else {
    ScalarOpCost =
        TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType());
  }

  // The existing splats may be kept around if other instructions use them.
  InstructionCost CostToKeepSplats =
      (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
  InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;

  LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
                    << "\n");
  LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
                    << ", Cost of scalarizing: " << NewCost << "\n");

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Scalarize the intrinsic.
  ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
  Value *EVL = VPI.getArgOperand(3);

  // If the VP op might introduce UB or poison, we can scalarize it provided
  // that we know the EVL > 0: if the EVL is zero, the original VP op is a
  // no-op (and thus not UB), so make sure we don't introduce UB by
  // scalarizing it.
  bool SafeToSpeculate;
  if (ScalarIntrID)
    SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID)
                          .hasFnAttr(Attribute::AttrKind::Speculatable);
  else
    SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
        *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
  if (!SafeToSpeculate &&
      !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
    return false;

  Value *ScalarVal =
      ScalarIntrID
          ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
                                    {ScalarOp0, ScalarOp1})
          : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
                                ScalarOp0, ScalarOp1);

  replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
  return true;
}

/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  Value *Ins0, *Ins1;
  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
    return false;

  // Do not convert the vector condition of a vector select into a scalar
That may cause problems for codegen because of differences in 9175ffd83dbSDimitry Andric // boolean formats and register-file transfers. 9185ffd83dbSDimitry Andric // TODO: Can we account for that in the cost model? 9195ffd83dbSDimitry Andric bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; 9205ffd83dbSDimitry Andric if (IsCmp) 9215ffd83dbSDimitry Andric for (User *U : I.users()) 9225ffd83dbSDimitry Andric if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value()))) 9235ffd83dbSDimitry Andric return false; 9245ffd83dbSDimitry Andric 9255ffd83dbSDimitry Andric // Match against one or both scalar values being inserted into constant 9265ffd83dbSDimitry Andric // vectors: 9275ffd83dbSDimitry Andric // vec_op VecC0, (inselt VecC1, V1, Index) 9285ffd83dbSDimitry Andric // vec_op (inselt VecC0, V0, Index), VecC1 9295ffd83dbSDimitry Andric // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) 9305ffd83dbSDimitry Andric // TODO: Deal with mismatched index constants and variable indexes? 9315ffd83dbSDimitry Andric Constant *VecC0 = nullptr, *VecC1 = nullptr; 9325ffd83dbSDimitry Andric Value *V0 = nullptr, *V1 = nullptr; 9335ffd83dbSDimitry Andric uint64_t Index0 = 0, Index1 = 0; 9345ffd83dbSDimitry Andric if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0), 9355ffd83dbSDimitry Andric m_ConstantInt(Index0))) && 9365ffd83dbSDimitry Andric !match(Ins0, m_Constant(VecC0))) 9375ffd83dbSDimitry Andric return false; 9385ffd83dbSDimitry Andric if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1), 9395ffd83dbSDimitry Andric m_ConstantInt(Index1))) && 9405ffd83dbSDimitry Andric !match(Ins1, m_Constant(VecC1))) 9415ffd83dbSDimitry Andric return false; 9425ffd83dbSDimitry Andric 9435ffd83dbSDimitry Andric bool IsConst0 = !V0; 9445ffd83dbSDimitry Andric bool IsConst1 = !V1; 9455ffd83dbSDimitry Andric if (IsConst0 && IsConst1) 9465ffd83dbSDimitry Andric return false; 9475ffd83dbSDimitry Andric if (!IsConst0 && !IsConst1 && Index0 != Index1) 9485ffd83dbSDimitry Andric return false; 9495ffd83dbSDimitry Andric 9505ffd83dbSDimitry Andric // Bail for single insertion if it is a load. 9515ffd83dbSDimitry Andric // TODO: Handle this once getVectorInstrCost can cost for load/stores. 9525ffd83dbSDimitry Andric auto *I0 = dyn_cast_or_null<Instruction>(V0); 9535ffd83dbSDimitry Andric auto *I1 = dyn_cast_or_null<Instruction>(V1); 9545ffd83dbSDimitry Andric if ((IsConst0 && I1 && I1->mayReadFromMemory()) || 9555ffd83dbSDimitry Andric (IsConst1 && I0 && I0->mayReadFromMemory())) 9565ffd83dbSDimitry Andric return false; 9575ffd83dbSDimitry Andric 9585ffd83dbSDimitry Andric uint64_t Index = IsConst0 ? Index1 : Index0; 9595ffd83dbSDimitry Andric Type *ScalarTy = IsConst0 ? 
V1->getType() : V0->getType(); 9605ffd83dbSDimitry Andric Type *VecTy = I.getType(); 9615ffd83dbSDimitry Andric assert(VecTy->isVectorTy() && 9625ffd83dbSDimitry Andric (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && 9635ffd83dbSDimitry Andric (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || 9645ffd83dbSDimitry Andric ScalarTy->isPointerTy()) && 9655ffd83dbSDimitry Andric "Unexpected types for insert element into binop or cmp"); 9665ffd83dbSDimitry Andric 9675ffd83dbSDimitry Andric unsigned Opcode = I.getOpcode(); 968e8d8bef9SDimitry Andric InstructionCost ScalarOpCost, VectorOpCost; 9695ffd83dbSDimitry Andric if (IsCmp) { 970349cc55cSDimitry Andric CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); 971349cc55cSDimitry Andric ScalarOpCost = TTI.getCmpSelInstrCost( 972349cc55cSDimitry Andric Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); 973349cc55cSDimitry Andric VectorOpCost = TTI.getCmpSelInstrCost( 974349cc55cSDimitry Andric Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); 9755ffd83dbSDimitry Andric } else { 9765ffd83dbSDimitry Andric ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); 9775ffd83dbSDimitry Andric VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); 9785ffd83dbSDimitry Andric } 9795ffd83dbSDimitry Andric 9805ffd83dbSDimitry Andric // Get cost estimate for the insert element. This cost will factor into 9815ffd83dbSDimitry Andric // both sequences. 982bdd1243dSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 983bdd1243dSDimitry Andric InstructionCost InsertCost = TTI.getVectorInstrCost( 984bdd1243dSDimitry Andric Instruction::InsertElement, VecTy, CostKind, Index); 985e8d8bef9SDimitry Andric InstructionCost OldCost = 986e8d8bef9SDimitry Andric (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; 987e8d8bef9SDimitry Andric InstructionCost NewCost = ScalarOpCost + InsertCost + 9885ffd83dbSDimitry Andric (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + 9895ffd83dbSDimitry Andric (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); 9905ffd83dbSDimitry Andric 9915ffd83dbSDimitry Andric // We want to scalarize unless the vector variant actually has lower cost. 992e8d8bef9SDimitry Andric if (OldCost < NewCost || !NewCost.isValid()) 9935ffd83dbSDimitry Andric return false; 9945ffd83dbSDimitry Andric 9955ffd83dbSDimitry Andric // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> 9965ffd83dbSDimitry Andric // inselt NewVecC, (scalar_op V0, V1), Index 9975ffd83dbSDimitry Andric if (IsCmp) 9985ffd83dbSDimitry Andric ++NumScalarCmp; 9995ffd83dbSDimitry Andric else 10005ffd83dbSDimitry Andric ++NumScalarBO; 10015ffd83dbSDimitry Andric 10025ffd83dbSDimitry Andric // For constant cases, extract the scalar element, this should constant fold. 10035ffd83dbSDimitry Andric if (IsConst0) 10045ffd83dbSDimitry Andric V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index)); 10055ffd83dbSDimitry Andric if (IsConst1) 10065ffd83dbSDimitry Andric V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index)); 10075ffd83dbSDimitry Andric 10085ffd83dbSDimitry Andric Value *Scalar = 10095ffd83dbSDimitry Andric IsCmp ? Builder.CreateCmp(Pred, V0, V1) 10105ffd83dbSDimitry Andric : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); 10115ffd83dbSDimitry Andric 10125ffd83dbSDimitry Andric Scalar->setName(I.getName() + ".scalar"); 10135ffd83dbSDimitry Andric 10145ffd83dbSDimitry Andric // All IR flags are safe to back-propagate. 
There is no potential for extra 10155ffd83dbSDimitry Andric // poison to be created by the scalar instruction. 10165ffd83dbSDimitry Andric if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) 10175ffd83dbSDimitry Andric ScalarInst->copyIRFlags(&I); 10185ffd83dbSDimitry Andric 10195ffd83dbSDimitry Andric // Fold the vector constants in the original vectors into a new base vector. 102081ad6265SDimitry Andric Value *NewVecC = 102181ad6265SDimitry Andric IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1) 102281ad6265SDimitry Andric : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); 10235ffd83dbSDimitry Andric Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); 10245ffd83dbSDimitry Andric replaceValue(I, *Insert); 10255ffd83dbSDimitry Andric return true; 10265ffd83dbSDimitry Andric } 10275ffd83dbSDimitry Andric 10285ffd83dbSDimitry Andric /// Try to combine a scalar binop + 2 scalar compares of extracted elements of 10295ffd83dbSDimitry Andric /// a vector into vector operations followed by extract. Note: The SLP pass 10305ffd83dbSDimitry Andric /// may miss this pattern because of implementation problems. 10315ffd83dbSDimitry Andric bool VectorCombine::foldExtractedCmps(Instruction &I) { 10325ffd83dbSDimitry Andric // We are looking for a scalar binop of booleans. 10335ffd83dbSDimitry Andric // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) 10345ffd83dbSDimitry Andric if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1)) 10355ffd83dbSDimitry Andric return false; 10365ffd83dbSDimitry Andric 10375ffd83dbSDimitry Andric // The compare predicates should match, and each compare should have a 10385ffd83dbSDimitry Andric // constant operand. 10395ffd83dbSDimitry Andric // TODO: Relax the one-use constraints. 10405ffd83dbSDimitry Andric Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); 10415ffd83dbSDimitry Andric Instruction *I0, *I1; 10425ffd83dbSDimitry Andric Constant *C0, *C1; 10435ffd83dbSDimitry Andric CmpInst::Predicate P0, P1; 10445ffd83dbSDimitry Andric if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) || 10455ffd83dbSDimitry Andric !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) || 10465ffd83dbSDimitry Andric P0 != P1) 10475ffd83dbSDimitry Andric return false; 10485ffd83dbSDimitry Andric 10495ffd83dbSDimitry Andric // The compare operands must be extracts of the same vector with constant 10505ffd83dbSDimitry Andric // extract indexes. 10515ffd83dbSDimitry Andric // TODO: Relax the one-use constraints. 10525ffd83dbSDimitry Andric Value *X; 10535ffd83dbSDimitry Andric uint64_t Index0, Index1; 10545ffd83dbSDimitry Andric if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) || 10555ffd83dbSDimitry Andric !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))) 10565ffd83dbSDimitry Andric return false; 10575ffd83dbSDimitry Andric 10585ffd83dbSDimitry Andric auto *Ext0 = cast<ExtractElementInst>(I0); 10595ffd83dbSDimitry Andric auto *Ext1 = cast<ExtractElementInst>(I1); 10605ffd83dbSDimitry Andric ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); 10615ffd83dbSDimitry Andric if (!ConvertToShuf) 10625ffd83dbSDimitry Andric return false; 10635ffd83dbSDimitry Andric 10645ffd83dbSDimitry Andric // The original scalar pattern is: 10655ffd83dbSDimitry Andric // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) 10665ffd83dbSDimitry Andric CmpInst::Predicate Pred = P0; 10675ffd83dbSDimitry Andric unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? 
Instruction::FCmp 10685ffd83dbSDimitry Andric : Instruction::ICmp; 10695ffd83dbSDimitry Andric auto *VecTy = dyn_cast<FixedVectorType>(X->getType()); 10705ffd83dbSDimitry Andric if (!VecTy) 10715ffd83dbSDimitry Andric return false; 10725ffd83dbSDimitry Andric 1073bdd1243dSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1074e8d8bef9SDimitry Andric InstructionCost OldCost = 1075bdd1243dSDimitry Andric TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); 1076bdd1243dSDimitry Andric OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); 1077349cc55cSDimitry Andric OldCost += 1078349cc55cSDimitry Andric TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), 1079349cc55cSDimitry Andric CmpInst::makeCmpResultType(I0->getType()), Pred) * 1080349cc55cSDimitry Andric 2; 10815ffd83dbSDimitry Andric OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); 10825ffd83dbSDimitry Andric 10835ffd83dbSDimitry Andric // The proposed vector pattern is: 10845ffd83dbSDimitry Andric // vcmp = cmp Pred X, VecC 10855ffd83dbSDimitry Andric // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 10865ffd83dbSDimitry Andric int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; 10875ffd83dbSDimitry Andric int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; 10885ffd83dbSDimitry Andric auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); 1089349cc55cSDimitry Andric InstructionCost NewCost = TTI.getCmpSelInstrCost( 1090349cc55cSDimitry Andric CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred); 109106c3fb27SDimitry Andric SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); 1092fe6060f1SDimitry Andric ShufMask[CheapIndex] = ExpensiveIndex; 1093fe6060f1SDimitry Andric NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, 1094fe6060f1SDimitry Andric ShufMask); 10955ffd83dbSDimitry Andric NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); 1096bdd1243dSDimitry Andric NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); 10975ffd83dbSDimitry Andric 10985ffd83dbSDimitry Andric // Aggressively form vector ops if the cost is equal because the transform 10995ffd83dbSDimitry Andric // may enable further optimization. 11005ffd83dbSDimitry Andric // Codegen can reverse this transform (scalarize) if it was not profitable. 1101e8d8bef9SDimitry Andric if (OldCost < NewCost || !NewCost.isValid()) 11025ffd83dbSDimitry Andric return false; 11035ffd83dbSDimitry Andric 11045ffd83dbSDimitry Andric // Create a vector constant from the 2 scalar constants. 
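// E.g. with hypothetical values Index0 = 0, C0 = 42, Index1 = 2, C1 = 7 on a
// <4 x i32> input, CmpC becomes <i32 42, i32 poison, i32 7, i32 poison>; the
// poison lanes are harmless because only the CheapIndex lane of the final
// binop result is extracted.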
11055ffd83dbSDimitry Andric SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
110606c3fb27SDimitry Andric PoisonValue::get(VecTy->getElementType()));
11075ffd83dbSDimitry Andric CmpC[Index0] = C0;
11085ffd83dbSDimitry Andric CmpC[Index1] = C1;
11095ffd83dbSDimitry Andric Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
11105ffd83dbSDimitry Andric 
11115ffd83dbSDimitry Andric Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
11125ffd83dbSDimitry Andric Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
11135ffd83dbSDimitry Andric VCmp, Shuf);
11145ffd83dbSDimitry Andric Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
11155ffd83dbSDimitry Andric replaceValue(I, *NewExt);
11165ffd83dbSDimitry Andric ++NumVecCmpBO;
11175ffd83dbSDimitry Andric return true;
11185ffd83dbSDimitry Andric }
11195ffd83dbSDimitry Andric 
1120fe6060f1SDimitry Andric // Check if memory loc modified between two instrs in the same BB
1121fe6060f1SDimitry Andric static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1122fe6060f1SDimitry Andric BasicBlock::iterator End,
1123fe6060f1SDimitry Andric const MemoryLocation &Loc, AAResults &AA) {
1124fe6060f1SDimitry Andric unsigned NumScanned = 0;
1125fe6060f1SDimitry Andric return std::any_of(Begin, End, [&](const Instruction &Instr) {
1126fe6060f1SDimitry Andric return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1127fe6060f1SDimitry Andric ++NumScanned > MaxInstrsToScan;
1128fe6060f1SDimitry Andric });
1129fe6060f1SDimitry Andric }
1130fe6060f1SDimitry Andric 
1131bdd1243dSDimitry Andric namespace {
1132349cc55cSDimitry Andric /// Helper class to indicate whether a vector index can be safely scalarized and
1133349cc55cSDimitry Andric /// if a freeze needs to be inserted.
1134349cc55cSDimitry Andric class ScalarizationResult {
1135349cc55cSDimitry Andric enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1136349cc55cSDimitry Andric 
1137349cc55cSDimitry Andric StatusTy Status;
1138349cc55cSDimitry Andric Value *ToFreeze;
1139349cc55cSDimitry Andric 
1140349cc55cSDimitry Andric ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1141349cc55cSDimitry Andric : Status(Status), ToFreeze(ToFreeze) {}
1142349cc55cSDimitry Andric 
1143349cc55cSDimitry Andric public:
1144349cc55cSDimitry Andric ScalarizationResult(const ScalarizationResult &Other) = default;
1145349cc55cSDimitry Andric ~ScalarizationResult() {
1146349cc55cSDimitry Andric assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1147349cc55cSDimitry Andric }
1148349cc55cSDimitry Andric 
1149349cc55cSDimitry Andric static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1150349cc55cSDimitry Andric static ScalarizationResult safe() { return {StatusTy::Safe}; }
1151349cc55cSDimitry Andric static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1152349cc55cSDimitry Andric return {StatusTy::SafeWithFreeze, ToFreeze};
1153349cc55cSDimitry Andric }
1154349cc55cSDimitry Andric 
1155349cc55cSDimitry Andric /// Returns true if the index can be scalarized without requiring a freeze.
1156349cc55cSDimitry Andric bool isSafe() const { return Status == StatusTy::Safe; }
1157349cc55cSDimitry Andric /// Returns true if the index cannot be scalarized.
1158349cc55cSDimitry Andric bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1159349cc55cSDimitry Andric /// Returns true if the index can be scalarized, but requires inserting a
1160349cc55cSDimitry Andric /// freeze.
1161349cc55cSDimitry Andric bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1162349cc55cSDimitry Andric 
1163349cc55cSDimitry Andric /// Reset the state to Unsafe and clear ToFreeze if set.
1164349cc55cSDimitry Andric void discard() {
1165349cc55cSDimitry Andric ToFreeze = nullptr;
1166349cc55cSDimitry Andric Status = StatusTy::Unsafe;
1167349cc55cSDimitry Andric }
1168349cc55cSDimitry Andric 
1169349cc55cSDimitry Andric /// Freeze the ToFreeze and update the use in \p UserI to use it.
1170349cc55cSDimitry Andric void freeze(IRBuilder<> &Builder, Instruction &UserI) {
1171349cc55cSDimitry Andric assert(isSafeWithFreeze() &&
1172349cc55cSDimitry Andric "should only be used when freezing is required");
1173349cc55cSDimitry Andric assert(is_contained(ToFreeze->users(), &UserI) &&
1174349cc55cSDimitry Andric "UserI must be a user of ToFreeze");
1175349cc55cSDimitry Andric IRBuilder<>::InsertPointGuard Guard(Builder);
1176349cc55cSDimitry Andric Builder.SetInsertPoint(cast<Instruction>(&UserI));
1177349cc55cSDimitry Andric Value *Frozen =
1178349cc55cSDimitry Andric Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1179349cc55cSDimitry Andric for (Use &U : make_early_inc_range((UserI.operands())))
1180349cc55cSDimitry Andric if (U.get() == ToFreeze)
1181349cc55cSDimitry Andric U.set(Frozen);
1182349cc55cSDimitry Andric 
1183349cc55cSDimitry Andric ToFreeze = nullptr;
1184349cc55cSDimitry Andric }
1185349cc55cSDimitry Andric };
1186bdd1243dSDimitry Andric } // namespace
1187349cc55cSDimitry Andric 
1188fe6060f1SDimitry Andric /// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1189fe6060f1SDimitry Andric /// Idx. \p Idx must access a valid vector element.
11905f757f3fSDimitry Andric static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
11915f757f3fSDimitry Andric Instruction *CtxI,
1192349cc55cSDimitry Andric AssumptionCache &AC,
1193349cc55cSDimitry Andric const DominatorTree &DT) {
11945f757f3fSDimitry Andric // We do checks for both fixed vector types and scalable vector types.
11955f757f3fSDimitry Andric // This is the number of elements of fixed vector types,
11965f757f3fSDimitry Andric // or the minimum number of elements of scalable vector types.
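// E.g. for <4 x i32> NumElements is 4; for <vscale x 4 x i32> it is also 4,
// which is conservative because the runtime element count of the scalable
// vector is some positive multiple of 4, so an index known to be below 4 is
// always in bounds.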
11975f757f3fSDimitry Andric uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); 11985f757f3fSDimitry Andric 1199349cc55cSDimitry Andric if (auto *C = dyn_cast<ConstantInt>(Idx)) { 12005f757f3fSDimitry Andric if (C->getValue().ult(NumElements)) 1201349cc55cSDimitry Andric return ScalarizationResult::safe(); 1202349cc55cSDimitry Andric return ScalarizationResult::unsafe(); 1203349cc55cSDimitry Andric } 1204fe6060f1SDimitry Andric 1205349cc55cSDimitry Andric unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); 1206349cc55cSDimitry Andric APInt Zero(IntWidth, 0); 12075f757f3fSDimitry Andric APInt MaxElts(IntWidth, NumElements); 1208fe6060f1SDimitry Andric ConstantRange ValidIndices(Zero, MaxElts); 1209349cc55cSDimitry Andric ConstantRange IdxRange(IntWidth, true); 1210349cc55cSDimitry Andric 1211349cc55cSDimitry Andric if (isGuaranteedNotToBePoison(Idx, &AC)) { 121204eeddc0SDimitry Andric if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false, 121304eeddc0SDimitry Andric true, &AC, CtxI, &DT))) 1214349cc55cSDimitry Andric return ScalarizationResult::safe(); 1215349cc55cSDimitry Andric return ScalarizationResult::unsafe(); 1216349cc55cSDimitry Andric } 1217349cc55cSDimitry Andric 1218349cc55cSDimitry Andric // If the index may be poison, check if we can insert a freeze before the 1219349cc55cSDimitry Andric // range of the index is restricted. 1220349cc55cSDimitry Andric Value *IdxBase; 1221349cc55cSDimitry Andric ConstantInt *CI; 1222349cc55cSDimitry Andric if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) { 1223349cc55cSDimitry Andric IdxRange = IdxRange.binaryAnd(CI->getValue()); 1224349cc55cSDimitry Andric } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) { 1225349cc55cSDimitry Andric IdxRange = IdxRange.urem(CI->getValue()); 1226349cc55cSDimitry Andric } 1227349cc55cSDimitry Andric 1228349cc55cSDimitry Andric if (ValidIndices.contains(IdxRange)) 1229349cc55cSDimitry Andric return ScalarizationResult::safeWithFreeze(IdxBase); 1230349cc55cSDimitry Andric return ScalarizationResult::unsafe(); 1231fe6060f1SDimitry Andric } 1232fe6060f1SDimitry Andric 1233fe6060f1SDimitry Andric /// The memory operation on a vector of \p ScalarType had alignment of 1234fe6060f1SDimitry Andric /// \p VectorAlignment. Compute the maximal, but conservatively correct, 1235fe6060f1SDimitry Andric /// alignment that will be valid for the memory operation on a single scalar 1236fe6060f1SDimitry Andric /// element of the same type with index \p Idx. 
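/// E.g. for a <4 x i32> access with VectorAlignment = 16, a constant index 2
/// gives commonAlignment(16, 2 * 4) = 8, while an unknown index has to fall
/// back to commonAlignment(16, 4) = 4.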
1237fe6060f1SDimitry Andric static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1238fe6060f1SDimitry Andric Type *ScalarType, Value *Idx,
1239fe6060f1SDimitry Andric const DataLayout &DL) {
1240fe6060f1SDimitry Andric if (auto *C = dyn_cast<ConstantInt>(Idx))
1241fe6060f1SDimitry Andric return commonAlignment(VectorAlignment,
1242fe6060f1SDimitry Andric C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1243fe6060f1SDimitry Andric return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1244fe6060f1SDimitry Andric }
1245fe6060f1SDimitry Andric 
1246fe6060f1SDimitry Andric // Combine patterns like:
1247fe6060f1SDimitry Andric // %0 = load <4 x i32>, <4 x i32>* %a
1248fe6060f1SDimitry Andric // %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1249fe6060f1SDimitry Andric // store <4 x i32> %1, <4 x i32>* %a
1250fe6060f1SDimitry Andric // to:
1251fe6060f1SDimitry Andric // %0 = bitcast <4 x i32>* %a to i32*
1252fe6060f1SDimitry Andric // %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1253fe6060f1SDimitry Andric // store i32 %b, i32* %1
1254fe6060f1SDimitry Andric bool VectorCombine::foldSingleElementStore(Instruction &I) {
1255bdd1243dSDimitry Andric auto *SI = cast<StoreInst>(&I);
12565f757f3fSDimitry Andric if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1257fe6060f1SDimitry Andric return false;
1258fe6060f1SDimitry Andric 
1259fe6060f1SDimitry Andric // TODO: Combine more complicated patterns (multiple insert) by referencing
1260fe6060f1SDimitry Andric // TargetTransformInfo.
1261fe6060f1SDimitry Andric Instruction *Source;
1262fe6060f1SDimitry Andric Value *NewElement;
1263fe6060f1SDimitry Andric Value *Idx;
1264fe6060f1SDimitry Andric if (!match(SI->getValueOperand(),
1265fe6060f1SDimitry Andric m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1266fe6060f1SDimitry Andric m_Value(Idx))))
1267fe6060f1SDimitry Andric return false;
1268fe6060f1SDimitry Andric 
1269fe6060f1SDimitry Andric if (auto *Load = dyn_cast<LoadInst>(Source)) {
12705f757f3fSDimitry Andric auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1271fe6060f1SDimitry Andric Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1272fe6060f1SDimitry Andric // Don't optimize for atomic/volatile load or store. Ensure memory is not
1273fe6060f1SDimitry Andric // modified in between, the vector type matches the store size, and the index is in bounds.
1274fe6060f1SDimitry Andric if (!Load->isSimple() || Load->getParent() != SI->getParent() || 12750fca6ea1SDimitry Andric !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) || 1276349cc55cSDimitry Andric SrcAddr != SI->getPointerOperand()->stripPointerCasts()) 1277349cc55cSDimitry Andric return false; 1278349cc55cSDimitry Andric 1279349cc55cSDimitry Andric auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT); 1280349cc55cSDimitry Andric if (ScalarizableIdx.isUnsafe() || 1281fe6060f1SDimitry Andric isMemModifiedBetween(Load->getIterator(), SI->getIterator(), 1282fe6060f1SDimitry Andric MemoryLocation::get(SI), AA)) 1283fe6060f1SDimitry Andric return false; 1284fe6060f1SDimitry Andric 1285349cc55cSDimitry Andric if (ScalarizableIdx.isSafeWithFreeze()) 1286349cc55cSDimitry Andric ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); 1287fe6060f1SDimitry Andric Value *GEP = Builder.CreateInBoundsGEP( 1288fe6060f1SDimitry Andric SI->getValueOperand()->getType(), SI->getPointerOperand(), 1289fe6060f1SDimitry Andric {ConstantInt::get(Idx->getType(), 0), Idx}); 1290fe6060f1SDimitry Andric StoreInst *NSI = Builder.CreateStore(NewElement, GEP); 1291fe6060f1SDimitry Andric NSI->copyMetadata(*SI); 1292fe6060f1SDimitry Andric Align ScalarOpAlignment = computeAlignmentAfterScalarization( 1293fe6060f1SDimitry Andric std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx, 12940fca6ea1SDimitry Andric *DL); 1295fe6060f1SDimitry Andric NSI->setAlignment(ScalarOpAlignment); 1296fe6060f1SDimitry Andric replaceValue(I, *NSI); 1297349cc55cSDimitry Andric eraseInstruction(I); 1298fe6060f1SDimitry Andric return true; 1299fe6060f1SDimitry Andric } 1300fe6060f1SDimitry Andric 1301fe6060f1SDimitry Andric return false; 1302fe6060f1SDimitry Andric } 1303fe6060f1SDimitry Andric 1304fe6060f1SDimitry Andric /// Try to scalarize vector loads feeding extractelement instructions. 1305fe6060f1SDimitry Andric bool VectorCombine::scalarizeLoadExtract(Instruction &I) { 1306fe6060f1SDimitry Andric Value *Ptr; 1307349cc55cSDimitry Andric if (!match(&I, m_Load(m_Value(Ptr)))) 1308fe6060f1SDimitry Andric return false; 1309fe6060f1SDimitry Andric 13105f757f3fSDimitry Andric auto *VecTy = cast<VectorType>(I.getType()); 1311349cc55cSDimitry Andric auto *LI = cast<LoadInst>(&I); 13120fca6ea1SDimitry Andric if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) 1313fe6060f1SDimitry Andric return false; 1314fe6060f1SDimitry Andric 13150eae32dcSDimitry Andric InstructionCost OriginalCost = 13165f757f3fSDimitry Andric TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), 1317fe6060f1SDimitry Andric LI->getPointerAddressSpace()); 1318fe6060f1SDimitry Andric InstructionCost ScalarizedCost = 0; 1319fe6060f1SDimitry Andric 1320fe6060f1SDimitry Andric Instruction *LastCheckedInst = LI; 1321fe6060f1SDimitry Andric unsigned NumInstChecked = 0; 13225f757f3fSDimitry Andric DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; 13235f757f3fSDimitry Andric auto FailureGuard = make_scope_exit([&]() { 13245f757f3fSDimitry Andric // If the transform is aborted, discard the ScalarizationResults. 13255f757f3fSDimitry Andric for (auto &Pair : NeedFreeze) 13265f757f3fSDimitry Andric Pair.second.discard(); 13275f757f3fSDimitry Andric }); 13285f757f3fSDimitry Andric 1329fe6060f1SDimitry Andric // Check if all users of the load are extracts with no memory modifications 1330fe6060f1SDimitry Andric // between the load and the extract. 
Compute the cost of both the original 1331fe6060f1SDimitry Andric // code and the scalarized version. 1332fe6060f1SDimitry Andric for (User *U : LI->users()) { 1333fe6060f1SDimitry Andric auto *UI = dyn_cast<ExtractElementInst>(U); 1334fe6060f1SDimitry Andric if (!UI || UI->getParent() != LI->getParent()) 1335fe6060f1SDimitry Andric return false; 1336fe6060f1SDimitry Andric 1337fe6060f1SDimitry Andric // Check if any instruction between the load and the extract may modify 1338fe6060f1SDimitry Andric // memory. 1339fe6060f1SDimitry Andric if (LastCheckedInst->comesBefore(UI)) { 1340fe6060f1SDimitry Andric for (Instruction &I : 1341fe6060f1SDimitry Andric make_range(std::next(LI->getIterator()), UI->getIterator())) { 1342fe6060f1SDimitry Andric // Bail out if we reached the check limit or the instruction may write 1343fe6060f1SDimitry Andric // to memory. 1344fe6060f1SDimitry Andric if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory()) 1345fe6060f1SDimitry Andric return false; 1346fe6060f1SDimitry Andric NumInstChecked++; 1347fe6060f1SDimitry Andric } 134881ad6265SDimitry Andric LastCheckedInst = UI; 1349fe6060f1SDimitry Andric } 1350fe6060f1SDimitry Andric 13515f757f3fSDimitry Andric auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); 13525f757f3fSDimitry Andric if (ScalarIdx.isUnsafe()) 1353fe6060f1SDimitry Andric return false; 13545f757f3fSDimitry Andric if (ScalarIdx.isSafeWithFreeze()) { 13555f757f3fSDimitry Andric NeedFreeze.try_emplace(UI, ScalarIdx); 13565f757f3fSDimitry Andric ScalarIdx.discard(); 1357349cc55cSDimitry Andric } 1358fe6060f1SDimitry Andric 1359fe6060f1SDimitry Andric auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); 1360bdd1243dSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1361fe6060f1SDimitry Andric OriginalCost += 13625f757f3fSDimitry Andric TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, 1363fe6060f1SDimitry Andric Index ? Index->getZExtValue() : -1); 1364fe6060f1SDimitry Andric ScalarizedCost += 13655f757f3fSDimitry Andric TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), 1366fe6060f1SDimitry Andric Align(1), LI->getPointerAddressSpace()); 13675f757f3fSDimitry Andric ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); 1368fe6060f1SDimitry Andric } 1369fe6060f1SDimitry Andric 1370fe6060f1SDimitry Andric if (ScalarizedCost >= OriginalCost) 1371fe6060f1SDimitry Andric return false; 1372fe6060f1SDimitry Andric 1373fe6060f1SDimitry Andric // Replace extracts with narrow scalar loads. 1374fe6060f1SDimitry Andric for (User *U : LI->users()) { 1375fe6060f1SDimitry Andric auto *EI = cast<ExtractElementInst>(U); 1376fe6060f1SDimitry Andric Value *Idx = EI->getOperand(1); 13775f757f3fSDimitry Andric 13785f757f3fSDimitry Andric // Insert 'freeze' for poison indexes. 
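// E.g. if the extract index was computed as 'and i64 %idxbase, 3' (in range
// for a vector of at least four elements), a freeze of %idxbase is emitted
// and the 'and' is rewritten to use it, so a poison %idxbase cannot leak
// poison into the address of the new scalar load.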
13795f757f3fSDimitry Andric auto It = NeedFreeze.find(EI); 13805f757f3fSDimitry Andric if (It != NeedFreeze.end()) 13815f757f3fSDimitry Andric It->second.freeze(Builder, *cast<Instruction>(Idx)); 13825f757f3fSDimitry Andric 13835f757f3fSDimitry Andric Builder.SetInsertPoint(EI); 1384fe6060f1SDimitry Andric Value *GEP = 13855f757f3fSDimitry Andric Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); 1386fe6060f1SDimitry Andric auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( 13875f757f3fSDimitry Andric VecTy->getElementType(), GEP, EI->getName() + ".scalar")); 1388fe6060f1SDimitry Andric 1389fe6060f1SDimitry Andric Align ScalarOpAlignment = computeAlignmentAfterScalarization( 13900fca6ea1SDimitry Andric LI->getAlign(), VecTy->getElementType(), Idx, *DL); 1391fe6060f1SDimitry Andric NewLoad->setAlignment(ScalarOpAlignment); 1392fe6060f1SDimitry Andric 1393fe6060f1SDimitry Andric replaceValue(*EI, *NewLoad); 1394fe6060f1SDimitry Andric } 1395fe6060f1SDimitry Andric 13965f757f3fSDimitry Andric FailureGuard.release(); 1397fe6060f1SDimitry Andric return true; 1398fe6060f1SDimitry Andric } 1399fe6060f1SDimitry Andric 14000fca6ea1SDimitry Andric /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)". 1401349cc55cSDimitry Andric bool VectorCombine::foldShuffleOfBinops(Instruction &I) { 1402349cc55cSDimitry Andric BinaryOperator *B0, *B1; 14030fca6ea1SDimitry Andric ArrayRef<int> OldMask; 1404349cc55cSDimitry Andric if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)), 14050fca6ea1SDimitry Andric m_Mask(OldMask)))) 1406349cc55cSDimitry Andric return false; 1407349cc55cSDimitry Andric 14080fca6ea1SDimitry Andric // Don't introduce poison into div/rem. 14090fca6ea1SDimitry Andric if (any_of(OldMask, [](int M) { return M == PoisonMaskElem; }) && 14100fca6ea1SDimitry Andric B0->isIntDivRem()) 1411349cc55cSDimitry Andric return false; 1412349cc55cSDimitry Andric 14130fca6ea1SDimitry Andric // TODO: Add support for addlike etc. 14140fca6ea1SDimitry Andric Instruction::BinaryOps Opcode = B0->getOpcode(); 14150fca6ea1SDimitry Andric if (Opcode != B1->getOpcode()) 14160fca6ea1SDimitry Andric return false; 14170fca6ea1SDimitry Andric 14180fca6ea1SDimitry Andric auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 14190fca6ea1SDimitry Andric auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType()); 14200fca6ea1SDimitry Andric if (!ShuffleDstTy || !BinOpTy) 14210fca6ea1SDimitry Andric return false; 14220fca6ea1SDimitry Andric 14230fca6ea1SDimitry Andric unsigned NumSrcElts = BinOpTy->getNumElements(); 14240fca6ea1SDimitry Andric 1425349cc55cSDimitry Andric // If we have something like "add X, Y" and "add Z, X", swap ops to match. 
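// E.g. for hypothetical IR
// %b0 = add <4 x i32> %x, %y
// %b1 = add <4 x i32> %z, %x
// X == W (%x), so X and Y are swapped; the Y == W check below then sees the
// shared operand and can emit a cheaper single-source shuffle for it.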
1426349cc55cSDimitry Andric Value *X = B0->getOperand(0), *Y = B0->getOperand(1); 1427349cc55cSDimitry Andric Value *Z = B1->getOperand(0), *W = B1->getOperand(1); 14280fca6ea1SDimitry Andric if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W && 14290fca6ea1SDimitry Andric (X == W || Y == Z)) 1430349cc55cSDimitry Andric std::swap(X, Y); 1431349cc55cSDimitry Andric 14320fca6ea1SDimitry Andric auto ConvertToUnary = [NumSrcElts](int &M) { 14330fca6ea1SDimitry Andric if (M >= (int)NumSrcElts) 14340fca6ea1SDimitry Andric M -= NumSrcElts; 14350fca6ea1SDimitry Andric }; 14360fca6ea1SDimitry Andric 14370fca6ea1SDimitry Andric SmallVector<int> NewMask0(OldMask.begin(), OldMask.end()); 14380fca6ea1SDimitry Andric TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc; 1439349cc55cSDimitry Andric if (X == Z) { 14400fca6ea1SDimitry Andric llvm::for_each(NewMask0, ConvertToUnary); 14410fca6ea1SDimitry Andric SK0 = TargetTransformInfo::SK_PermuteSingleSrc; 14420fca6ea1SDimitry Andric Z = PoisonValue::get(BinOpTy); 1443349cc55cSDimitry Andric } 1444349cc55cSDimitry Andric 14450fca6ea1SDimitry Andric SmallVector<int> NewMask1(OldMask.begin(), OldMask.end()); 14460fca6ea1SDimitry Andric TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc; 14470fca6ea1SDimitry Andric if (Y == W) { 14480fca6ea1SDimitry Andric llvm::for_each(NewMask1, ConvertToUnary); 14490fca6ea1SDimitry Andric SK1 = TargetTransformInfo::SK_PermuteSingleSrc; 14500fca6ea1SDimitry Andric W = PoisonValue::get(BinOpTy); 14510fca6ea1SDimitry Andric } 14520fca6ea1SDimitry Andric 14530fca6ea1SDimitry Andric // Try to replace a binop with a shuffle if the shuffle is not costly. 14540fca6ea1SDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 14550fca6ea1SDimitry Andric 14560fca6ea1SDimitry Andric InstructionCost OldCost = 14570fca6ea1SDimitry Andric TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) + 14580fca6ea1SDimitry Andric TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) + 14590fca6ea1SDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, 14600fca6ea1SDimitry Andric OldMask, CostKind, 0, nullptr, {B0, B1}, &I); 14610fca6ea1SDimitry Andric 14620fca6ea1SDimitry Andric InstructionCost NewCost = 14630fca6ea1SDimitry Andric TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + 14640fca6ea1SDimitry Andric TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) + 14650fca6ea1SDimitry Andric TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); 14660fca6ea1SDimitry Andric 14670fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I 14680fca6ea1SDimitry Andric << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 14690fca6ea1SDimitry Andric << "\n"); 14700fca6ea1SDimitry Andric if (NewCost >= OldCost) 14710fca6ea1SDimitry Andric return false; 14720fca6ea1SDimitry Andric 14730fca6ea1SDimitry Andric Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); 14740fca6ea1SDimitry Andric Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1); 1475349cc55cSDimitry Andric Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); 14760fca6ea1SDimitry Andric 1477349cc55cSDimitry Andric // Intersect flags from the old binops. 
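// E.g. if only one of the original binops carried nsw, andIRFlags drops nsw
// from the new binop: a flag is only kept when both sources guaranteed it.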
1478349cc55cSDimitry Andric if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { 1479349cc55cSDimitry Andric NewInst->copyIRFlags(B0); 1480349cc55cSDimitry Andric NewInst->andIRFlags(B1); 1481349cc55cSDimitry Andric } 14820fca6ea1SDimitry Andric 14830fca6ea1SDimitry Andric Worklist.pushValue(Shuf0); 14840fca6ea1SDimitry Andric Worklist.pushValue(Shuf1); 1485349cc55cSDimitry Andric replaceValue(I, *NewBO); 1486349cc55cSDimitry Andric return true; 1487349cc55cSDimitry Andric } 1488349cc55cSDimitry Andric 14890fca6ea1SDimitry Andric /// Try to convert "shuffle (castop), (castop)" with a shared castop operand 14900fca6ea1SDimitry Andric /// into "castop (shuffle)". 14910fca6ea1SDimitry Andric bool VectorCombine::foldShuffleOfCastops(Instruction &I) { 14920fca6ea1SDimitry Andric Value *V0, *V1; 14930fca6ea1SDimitry Andric ArrayRef<int> OldMask; 14940fca6ea1SDimitry Andric if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask)))) 14950fca6ea1SDimitry Andric return false; 14960fca6ea1SDimitry Andric 14970fca6ea1SDimitry Andric auto *C0 = dyn_cast<CastInst>(V0); 14980fca6ea1SDimitry Andric auto *C1 = dyn_cast<CastInst>(V1); 14990fca6ea1SDimitry Andric if (!C0 || !C1) 15000fca6ea1SDimitry Andric return false; 15010fca6ea1SDimitry Andric 15020fca6ea1SDimitry Andric Instruction::CastOps Opcode = C0->getOpcode(); 15030fca6ea1SDimitry Andric if (C0->getSrcTy() != C1->getSrcTy()) 15040fca6ea1SDimitry Andric return false; 15050fca6ea1SDimitry Andric 15060fca6ea1SDimitry Andric // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds. 15070fca6ea1SDimitry Andric if (Opcode != C1->getOpcode()) { 15080fca6ea1SDimitry Andric if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value()))) 15090fca6ea1SDimitry Andric Opcode = Instruction::SExt; 15100fca6ea1SDimitry Andric else 15110fca6ea1SDimitry Andric return false; 15120fca6ea1SDimitry Andric } 15130fca6ea1SDimitry Andric 15140fca6ea1SDimitry Andric auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 15150fca6ea1SDimitry Andric auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy()); 15160fca6ea1SDimitry Andric auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy()); 15170fca6ea1SDimitry Andric if (!ShuffleDstTy || !CastDstTy || !CastSrcTy) 15180fca6ea1SDimitry Andric return false; 15190fca6ea1SDimitry Andric 15200fca6ea1SDimitry Andric unsigned NumSrcElts = CastSrcTy->getNumElements(); 15210fca6ea1SDimitry Andric unsigned NumDstElts = CastDstTy->getNumElements(); 15220fca6ea1SDimitry Andric assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) && 15230fca6ea1SDimitry Andric "Only bitcasts expected to alter src/dst element counts"); 15240fca6ea1SDimitry Andric 15250fca6ea1SDimitry Andric // Check for bitcasting of unscalable vector types. 15260fca6ea1SDimitry Andric // e.g. <32 x i40> -> <40 x i32> 15270fca6ea1SDimitry Andric if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 && 15280fca6ea1SDimitry Andric (NumDstElts % NumSrcElts) != 0) 15290fca6ea1SDimitry Andric return false; 15300fca6ea1SDimitry Andric 15310fca6ea1SDimitry Andric SmallVector<int, 16> NewMask; 15320fca6ea1SDimitry Andric if (NumSrcElts >= NumDstElts) { 15330fca6ea1SDimitry Andric // The bitcast is from wide to narrow/equal elements. The shuffle mask can 15340fca6ea1SDimitry Andric // always be expanded to the equivalent form choosing narrower elements. 
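// E.g. for hypothetical IR where %c0/%c1 bitcast <4 x i32> to <2 x i64> and
// the old mask is <0, 2>, ScaleFactor is 2 and the new mask becomes
// <0, 1, 4, 5>, selecting the same bytes from the i32 sources before the
// bitcast.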
15350fca6ea1SDimitry Andric assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask"); 15360fca6ea1SDimitry Andric unsigned ScaleFactor = NumSrcElts / NumDstElts; 15370fca6ea1SDimitry Andric narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask); 15380fca6ea1SDimitry Andric } else { 15390fca6ea1SDimitry Andric // The bitcast is from narrow elements to wide elements. The shuffle mask 15400fca6ea1SDimitry Andric // must choose consecutive elements to allow casting first. 15410fca6ea1SDimitry Andric assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask"); 15420fca6ea1SDimitry Andric unsigned ScaleFactor = NumDstElts / NumSrcElts; 15430fca6ea1SDimitry Andric if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask)) 15440fca6ea1SDimitry Andric return false; 15450fca6ea1SDimitry Andric } 15460fca6ea1SDimitry Andric 15470fca6ea1SDimitry Andric auto *NewShuffleDstTy = 15480fca6ea1SDimitry Andric FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size()); 15490fca6ea1SDimitry Andric 15500fca6ea1SDimitry Andric // Try to replace a castop with a shuffle if the shuffle is not costly. 15510fca6ea1SDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 15520fca6ea1SDimitry Andric 15530fca6ea1SDimitry Andric InstructionCost CostC0 = 15540fca6ea1SDimitry Andric TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy, 15550fca6ea1SDimitry Andric TTI::CastContextHint::None, CostKind); 15560fca6ea1SDimitry Andric InstructionCost CostC1 = 15570fca6ea1SDimitry Andric TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, 15580fca6ea1SDimitry Andric TTI::CastContextHint::None, CostKind); 15590fca6ea1SDimitry Andric InstructionCost OldCost = CostC0 + CostC1; 15600fca6ea1SDimitry Andric OldCost += 15610fca6ea1SDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy, 15620fca6ea1SDimitry Andric OldMask, CostKind, 0, nullptr, std::nullopt, &I); 15630fca6ea1SDimitry Andric 15640fca6ea1SDimitry Andric InstructionCost NewCost = TTI.getShuffleCost( 15650fca6ea1SDimitry Andric TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind); 15660fca6ea1SDimitry Andric NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy, 15670fca6ea1SDimitry Andric TTI::CastContextHint::None, CostKind); 15680fca6ea1SDimitry Andric if (!C0->hasOneUse()) 15690fca6ea1SDimitry Andric NewCost += CostC0; 15700fca6ea1SDimitry Andric if (!C1->hasOneUse()) 15710fca6ea1SDimitry Andric NewCost += CostC1; 15720fca6ea1SDimitry Andric 15730fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I 15740fca6ea1SDimitry Andric << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 15750fca6ea1SDimitry Andric << "\n"); 15760fca6ea1SDimitry Andric if (NewCost > OldCost) 15770fca6ea1SDimitry Andric return false; 15780fca6ea1SDimitry Andric 15790fca6ea1SDimitry Andric Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0), 15800fca6ea1SDimitry Andric C1->getOperand(0), NewMask); 15810fca6ea1SDimitry Andric Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy); 15820fca6ea1SDimitry Andric 15830fca6ea1SDimitry Andric // Intersect flags from the old casts. 
15840fca6ea1SDimitry Andric if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
15850fca6ea1SDimitry Andric NewInst->copyIRFlags(C0);
15860fca6ea1SDimitry Andric NewInst->andIRFlags(C1);
15870fca6ea1SDimitry Andric }
15880fca6ea1SDimitry Andric 
15890fca6ea1SDimitry Andric Worklist.pushValue(Shuf);
15900fca6ea1SDimitry Andric replaceValue(I, *Cast);
15910fca6ea1SDimitry Andric return true;
15920fca6ea1SDimitry Andric }
15930fca6ea1SDimitry Andric 
15940fca6ea1SDimitry Andric /// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
15950fca6ea1SDimitry Andric /// into "shuffle x, y".
15960fca6ea1SDimitry Andric bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
15970fca6ea1SDimitry Andric Value *V0, *V1;
15980fca6ea1SDimitry Andric UndefValue *U0, *U1;
15990fca6ea1SDimitry Andric ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
16000fca6ea1SDimitry Andric if (!match(&I, m_Shuffle(m_OneUse(m_Shuffle(m_Value(V0), m_UndefValue(U0),
16010fca6ea1SDimitry Andric m_Mask(InnerMask0))),
16020fca6ea1SDimitry Andric m_OneUse(m_Shuffle(m_Value(V1), m_UndefValue(U1),
16030fca6ea1SDimitry Andric m_Mask(InnerMask1))),
16040fca6ea1SDimitry Andric m_Mask(OuterMask))))
16050fca6ea1SDimitry Andric return false;
16060fca6ea1SDimitry Andric 
16070fca6ea1SDimitry Andric auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0));
16080fca6ea1SDimitry Andric auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1));
16090fca6ea1SDimitry Andric auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
16100fca6ea1SDimitry Andric auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
16110fca6ea1SDimitry Andric auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
16120fca6ea1SDimitry Andric if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
16130fca6ea1SDimitry Andric V0->getType() != V1->getType())
16140fca6ea1SDimitry Andric return false;
16150fca6ea1SDimitry Andric 
16160fca6ea1SDimitry Andric unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
16170fca6ea1SDimitry Andric unsigned NumImmElts = ShuffleImmTy->getNumElements();
16180fca6ea1SDimitry Andric 
16190fca6ea1SDimitry Andric // Bail if either inner mask references a RHS undef arg.
16200fca6ea1SDimitry Andric if ((!isa<PoisonValue>(U0) &&
16210fca6ea1SDimitry Andric any_of(InnerMask0, [&](int M) { return M >= (int)NumSrcElts; })) ||
16220fca6ea1SDimitry Andric (!isa<PoisonValue>(U1) &&
16230fca6ea1SDimitry Andric any_of(InnerMask1, [&](int M) { return M >= (int)NumSrcElts; })))
16240fca6ea1SDimitry Andric return false;
16250fca6ea1SDimitry Andric 
16260fca6ea1SDimitry Andric // Merge the shuffles: replace indices into the RHS poison arg with PoisonMaskElem.
16270fca6ea1SDimitry Andric SmallVector<int, 16> NewMask(OuterMask.begin(), OuterMask.end());
16280fca6ea1SDimitry Andric for (int &M : NewMask) {
16290fca6ea1SDimitry Andric if (0 <= M && M < (int)NumImmElts) {
16300fca6ea1SDimitry Andric M = (InnerMask0[M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0[M];
16310fca6ea1SDimitry Andric } else if (M >= (int)NumImmElts) {
16320fca6ea1SDimitry Andric if (InnerMask1[M - NumImmElts] >= (int)NumSrcElts)
16330fca6ea1SDimitry Andric M = PoisonMaskElem;
16340fca6ea1SDimitry Andric else
16350fca6ea1SDimitry Andric M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts);
16360fca6ea1SDimitry Andric }
16370fca6ea1SDimitry Andric }
16380fca6ea1SDimitry Andric 
16390fca6ea1SDimitry Andric // Have we folded to an Identity shuffle?
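// E.g. an outer mask <1, 0> applied to an inner shuffle of %v0 with mask
// <1, 0> composes to <0, 1>, an identity of %v0, so the whole chain can be
// replaced by %v0 here.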
16400fca6ea1SDimitry Andric if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) { 16410fca6ea1SDimitry Andric replaceValue(I, *V0); 16420fca6ea1SDimitry Andric return true; 16430fca6ea1SDimitry Andric } 16440fca6ea1SDimitry Andric 16450fca6ea1SDimitry Andric // Try to merge the shuffles if the new shuffle is not costly. 16460fca6ea1SDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 16470fca6ea1SDimitry Andric 16480fca6ea1SDimitry Andric InstructionCost OldCost = 16490fca6ea1SDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, 16500fca6ea1SDimitry Andric InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0) + 16510fca6ea1SDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, 16520fca6ea1SDimitry Andric InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1) + 16530fca6ea1SDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, 16540fca6ea1SDimitry Andric OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I); 16550fca6ea1SDimitry Andric 16560fca6ea1SDimitry Andric InstructionCost NewCost = 16570fca6ea1SDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, 16580fca6ea1SDimitry Andric NewMask, CostKind, 0, nullptr, {V0, V1}); 16590fca6ea1SDimitry Andric 16600fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I 16610fca6ea1SDimitry Andric << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 16620fca6ea1SDimitry Andric << "\n"); 16630fca6ea1SDimitry Andric if (NewCost > OldCost) 16640fca6ea1SDimitry Andric return false; 16650fca6ea1SDimitry Andric 16660fca6ea1SDimitry Andric // Clear unused sources to poison. 16670fca6ea1SDimitry Andric if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; })) 16680fca6ea1SDimitry Andric V0 = PoisonValue::get(ShuffleSrcTy); 16690fca6ea1SDimitry Andric if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; })) 16700fca6ea1SDimitry Andric V1 = PoisonValue::get(ShuffleSrcTy); 16710fca6ea1SDimitry Andric 16720fca6ea1SDimitry Andric Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask); 16730fca6ea1SDimitry Andric replaceValue(I, *Shuf); 16740fca6ea1SDimitry Andric return true; 16750fca6ea1SDimitry Andric } 16760fca6ea1SDimitry Andric 16770fca6ea1SDimitry Andric using InstLane = std::pair<Use *, int>; 16780fca6ea1SDimitry Andric 16790fca6ea1SDimitry Andric static InstLane lookThroughShuffles(Use *U, int Lane) { 16800fca6ea1SDimitry Andric while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) { 16810fca6ea1SDimitry Andric unsigned NumElts = 16820fca6ea1SDimitry Andric cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements(); 16830fca6ea1SDimitry Andric int M = SV->getMaskValue(Lane); 16840fca6ea1SDimitry Andric if (M < 0) 16850fca6ea1SDimitry Andric return {nullptr, PoisonMaskElem}; 16860fca6ea1SDimitry Andric if (static_cast<unsigned>(M) < NumElts) { 16870fca6ea1SDimitry Andric U = &SV->getOperandUse(0); 16880fca6ea1SDimitry Andric Lane = M; 16890fca6ea1SDimitry Andric } else { 16900fca6ea1SDimitry Andric U = &SV->getOperandUse(1); 16910fca6ea1SDimitry Andric Lane = M - NumElts; 16920fca6ea1SDimitry Andric } 16930fca6ea1SDimitry Andric } 16940fca6ea1SDimitry Andric return InstLane{U, Lane}; 16950fca6ea1SDimitry Andric } 16960fca6ea1SDimitry Andric 16970fca6ea1SDimitry Andric static SmallVector<InstLane> 16980fca6ea1SDimitry Andric generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) { 16990fca6ea1SDimitry 
Andric SmallVector<InstLane> NItem;
17000fca6ea1SDimitry Andric for (InstLane IL : Item) {
17010fca6ea1SDimitry Andric auto [U, Lane] = IL;
17020fca6ea1SDimitry Andric InstLane OpLane =
17030fca6ea1SDimitry Andric U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
17040fca6ea1SDimitry Andric Lane)
17050fca6ea1SDimitry Andric : InstLane{nullptr, PoisonMaskElem};
17060fca6ea1SDimitry Andric NItem.emplace_back(OpLane);
17070fca6ea1SDimitry Andric }
17080fca6ea1SDimitry Andric return NItem;
17090fca6ea1SDimitry Andric }
17100fca6ea1SDimitry Andric 
17110fca6ea1SDimitry Andric /// Detect concat of multiple values into a vector.
17120fca6ea1SDimitry Andric static bool isFreeConcat(ArrayRef<InstLane> Item,
17130fca6ea1SDimitry Andric const TargetTransformInfo &TTI) {
17140fca6ea1SDimitry Andric auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
17150fca6ea1SDimitry Andric unsigned NumElts = Ty->getNumElements();
17160fca6ea1SDimitry Andric if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
17170fca6ea1SDimitry Andric return false;
17180fca6ea1SDimitry Andric 
17190fca6ea1SDimitry Andric // Check that the concat is free, usually meaning that the type will be split
17200fca6ea1SDimitry Andric // during legalization.
17210fca6ea1SDimitry Andric SmallVector<int, 16> ConcatMask(NumElts * 2);
17220fca6ea1SDimitry Andric std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
17230fca6ea1SDimitry Andric if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask,
17240fca6ea1SDimitry Andric TTI::TCK_RecipThroughput) != 0)
17250fca6ea1SDimitry Andric return false;
17260fca6ea1SDimitry Andric 
17270fca6ea1SDimitry Andric unsigned NumSlices = Item.size() / NumElts;
17280fca6ea1SDimitry Andric // Currently we generate a tree of shuffles for the concats, which limits us
17290fca6ea1SDimitry Andric // to a power of 2.
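// E.g. 4 slices of <4 x i32> concat as two v4->v8 shuffles plus one v8->v16
// shuffle; 3 slices would leave no balanced tree of pairwise concats.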
17300fca6ea1SDimitry Andric if (!isPowerOf2_32(NumSlices)) 17310fca6ea1SDimitry Andric return false; 17320fca6ea1SDimitry Andric for (unsigned Slice = 0; Slice < NumSlices; ++Slice) { 17330fca6ea1SDimitry Andric Use *SliceV = Item[Slice * NumElts].first; 17340fca6ea1SDimitry Andric if (!SliceV || SliceV->get()->getType() != Ty) 17350fca6ea1SDimitry Andric return false; 17360fca6ea1SDimitry Andric for (unsigned Elt = 0; Elt < NumElts; ++Elt) { 17370fca6ea1SDimitry Andric auto [V, Lane] = Item[Slice * NumElts + Elt]; 17380fca6ea1SDimitry Andric if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get()) 17390fca6ea1SDimitry Andric return false; 17400fca6ea1SDimitry Andric } 17410fca6ea1SDimitry Andric } 17420fca6ea1SDimitry Andric return true; 17430fca6ea1SDimitry Andric } 17440fca6ea1SDimitry Andric 17450fca6ea1SDimitry Andric static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty, 17460fca6ea1SDimitry Andric const SmallPtrSet<Use *, 4> &IdentityLeafs, 17470fca6ea1SDimitry Andric const SmallPtrSet<Use *, 4> &SplatLeafs, 17480fca6ea1SDimitry Andric const SmallPtrSet<Use *, 4> &ConcatLeafs, 17490fca6ea1SDimitry Andric IRBuilder<> &Builder) { 17500fca6ea1SDimitry Andric auto [FrontU, FrontLane] = Item.front(); 17510fca6ea1SDimitry Andric 17520fca6ea1SDimitry Andric if (IdentityLeafs.contains(FrontU)) { 17530fca6ea1SDimitry Andric return FrontU->get(); 17540fca6ea1SDimitry Andric } 17550fca6ea1SDimitry Andric if (SplatLeafs.contains(FrontU)) { 17560fca6ea1SDimitry Andric SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane); 17570fca6ea1SDimitry Andric return Builder.CreateShuffleVector(FrontU->get(), Mask); 17580fca6ea1SDimitry Andric } 17590fca6ea1SDimitry Andric if (ConcatLeafs.contains(FrontU)) { 17600fca6ea1SDimitry Andric unsigned NumElts = 17610fca6ea1SDimitry Andric cast<FixedVectorType>(FrontU->get()->getType())->getNumElements(); 17620fca6ea1SDimitry Andric SmallVector<Value *> Values(Item.size() / NumElts, nullptr); 17630fca6ea1SDimitry Andric for (unsigned S = 0; S < Values.size(); ++S) 17640fca6ea1SDimitry Andric Values[S] = Item[S * NumElts].first->get(); 17650fca6ea1SDimitry Andric 17660fca6ea1SDimitry Andric while (Values.size() > 1) { 17670fca6ea1SDimitry Andric NumElts *= 2; 17680fca6ea1SDimitry Andric SmallVector<int, 16> Mask(NumElts, 0); 17690fca6ea1SDimitry Andric std::iota(Mask.begin(), Mask.end(), 0); 17700fca6ea1SDimitry Andric SmallVector<Value *> NewValues(Values.size() / 2, nullptr); 17710fca6ea1SDimitry Andric for (unsigned S = 0; S < NewValues.size(); ++S) 17720fca6ea1SDimitry Andric NewValues[S] = 17730fca6ea1SDimitry Andric Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask); 17740fca6ea1SDimitry Andric Values = NewValues; 17750fca6ea1SDimitry Andric } 17760fca6ea1SDimitry Andric return Values[0]; 17770fca6ea1SDimitry Andric } 17780fca6ea1SDimitry Andric 17790fca6ea1SDimitry Andric auto *I = cast<Instruction>(FrontU->get()); 17800fca6ea1SDimitry Andric auto *II = dyn_cast<IntrinsicInst>(I); 17810fca6ea1SDimitry Andric unsigned NumOps = I->getNumOperands() - (II ? 
1 : 0); 17820fca6ea1SDimitry Andric SmallVector<Value *> Ops(NumOps); 17830fca6ea1SDimitry Andric for (unsigned Idx = 0; Idx < NumOps; Idx++) { 17840fca6ea1SDimitry Andric if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) { 17850fca6ea1SDimitry Andric Ops[Idx] = II->getOperand(Idx); 17860fca6ea1SDimitry Andric continue; 17870fca6ea1SDimitry Andric } 17880fca6ea1SDimitry Andric Ops[Idx] = 17890fca6ea1SDimitry Andric generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty, 17900fca6ea1SDimitry Andric IdentityLeafs, SplatLeafs, ConcatLeafs, Builder); 17910fca6ea1SDimitry Andric } 17920fca6ea1SDimitry Andric 17930fca6ea1SDimitry Andric SmallVector<Value *, 8> ValueList; 17940fca6ea1SDimitry Andric for (const auto &Lane : Item) 17950fca6ea1SDimitry Andric if (Lane.first) 17960fca6ea1SDimitry Andric ValueList.push_back(Lane.first->get()); 17970fca6ea1SDimitry Andric 17980fca6ea1SDimitry Andric Type *DstTy = 17990fca6ea1SDimitry Andric FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements()); 18000fca6ea1SDimitry Andric if (auto *BI = dyn_cast<BinaryOperator>(I)) { 18010fca6ea1SDimitry Andric auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(), 18020fca6ea1SDimitry Andric Ops[0], Ops[1]); 18030fca6ea1SDimitry Andric propagateIRFlags(Value, ValueList); 18040fca6ea1SDimitry Andric return Value; 18050fca6ea1SDimitry Andric } 18060fca6ea1SDimitry Andric if (auto *CI = dyn_cast<CmpInst>(I)) { 18070fca6ea1SDimitry Andric auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]); 18080fca6ea1SDimitry Andric propagateIRFlags(Value, ValueList); 18090fca6ea1SDimitry Andric return Value; 18100fca6ea1SDimitry Andric } 18110fca6ea1SDimitry Andric if (auto *SI = dyn_cast<SelectInst>(I)) { 18120fca6ea1SDimitry Andric auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI); 18130fca6ea1SDimitry Andric propagateIRFlags(Value, ValueList); 18140fca6ea1SDimitry Andric return Value; 18150fca6ea1SDimitry Andric } 18160fca6ea1SDimitry Andric if (auto *CI = dyn_cast<CastInst>(I)) { 18170fca6ea1SDimitry Andric auto *Value = Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), 18180fca6ea1SDimitry Andric Ops[0], DstTy); 18190fca6ea1SDimitry Andric propagateIRFlags(Value, ValueList); 18200fca6ea1SDimitry Andric return Value; 18210fca6ea1SDimitry Andric } 18220fca6ea1SDimitry Andric if (II) { 18230fca6ea1SDimitry Andric auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops); 18240fca6ea1SDimitry Andric propagateIRFlags(Value, ValueList); 18250fca6ea1SDimitry Andric return Value; 18260fca6ea1SDimitry Andric } 18270fca6ea1SDimitry Andric assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate"); 18280fca6ea1SDimitry Andric auto *Value = 18290fca6ea1SDimitry Andric Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]); 18300fca6ea1SDimitry Andric propagateIRFlags(Value, ValueList); 18310fca6ea1SDimitry Andric return Value; 18320fca6ea1SDimitry Andric } 18330fca6ea1SDimitry Andric 18340fca6ea1SDimitry Andric // Starting from a shuffle, look up through operands tracking the shuffled index 18350fca6ea1SDimitry Andric // of each lane. If we can simplify away the shuffles to identities then 18360fca6ea1SDimitry Andric // do so. 
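// E.g. for hypothetical IR that reverses two vectors, adds them and reverses
// the result:
// %r0 = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
// %r1 = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0>
// %a = fadd <2 x float> %r0, %r1
// %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> <i32 1, i32 0>
// every lane of %s traces back to the same lane of %x and %y, so the whole
// tree folds to a single 'fadd <2 x float> %x, %y'.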
18370fca6ea1SDimitry Andric bool VectorCombine::foldShuffleToIdentity(Instruction &I) { 18380fca6ea1SDimitry Andric auto *Ty = dyn_cast<FixedVectorType>(I.getType()); 18390fca6ea1SDimitry Andric if (!Ty || I.use_empty()) 18400fca6ea1SDimitry Andric return false; 18410fca6ea1SDimitry Andric 18420fca6ea1SDimitry Andric SmallVector<InstLane> Start(Ty->getNumElements()); 18430fca6ea1SDimitry Andric for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M) 18440fca6ea1SDimitry Andric Start[M] = lookThroughShuffles(&*I.use_begin(), M); 18450fca6ea1SDimitry Andric 18460fca6ea1SDimitry Andric SmallVector<SmallVector<InstLane>> Worklist; 18470fca6ea1SDimitry Andric Worklist.push_back(Start); 18480fca6ea1SDimitry Andric SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs; 18490fca6ea1SDimitry Andric unsigned NumVisited = 0; 18500fca6ea1SDimitry Andric 18510fca6ea1SDimitry Andric while (!Worklist.empty()) { 18520fca6ea1SDimitry Andric if (++NumVisited > MaxInstrsToScan) 18530fca6ea1SDimitry Andric return false; 18540fca6ea1SDimitry Andric 18550fca6ea1SDimitry Andric SmallVector<InstLane> Item = Worklist.pop_back_val(); 18560fca6ea1SDimitry Andric auto [FrontU, FrontLane] = Item.front(); 18570fca6ea1SDimitry Andric 18580fca6ea1SDimitry Andric // If we found an undef first lane then bail out to keep things simple. 18590fca6ea1SDimitry Andric if (!FrontU) 18600fca6ea1SDimitry Andric return false; 18610fca6ea1SDimitry Andric 18620fca6ea1SDimitry Andric // Helper to peek through bitcasts to the same value. 18630fca6ea1SDimitry Andric auto IsEquiv = [&](Value *X, Value *Y) { 18640fca6ea1SDimitry Andric return X->getType() == Y->getType() && 18650fca6ea1SDimitry Andric peekThroughBitcasts(X) == peekThroughBitcasts(Y); 18660fca6ea1SDimitry Andric }; 18670fca6ea1SDimitry Andric 18680fca6ea1SDimitry Andric // Look for an identity value. 18690fca6ea1SDimitry Andric if (FrontLane == 0 && 18700fca6ea1SDimitry Andric cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() == 18710fca6ea1SDimitry Andric Ty->getNumElements() && 18720fca6ea1SDimitry Andric all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) { 18730fca6ea1SDimitry Andric Value *FrontV = Item.front().first->get(); 18740fca6ea1SDimitry Andric return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) && 18750fca6ea1SDimitry Andric E.value().second == (int)E.index()); 18760fca6ea1SDimitry Andric })) { 18770fca6ea1SDimitry Andric IdentityLeafs.insert(FrontU); 18780fca6ea1SDimitry Andric continue; 18790fca6ea1SDimitry Andric } 18800fca6ea1SDimitry Andric // Look for constants, for the moment only supporting constant splats. 18810fca6ea1SDimitry Andric if (auto *C = dyn_cast<Constant>(FrontU); 18820fca6ea1SDimitry Andric C && C->getSplatValue() && 18830fca6ea1SDimitry Andric all_of(drop_begin(Item), [Item](InstLane &IL) { 18840fca6ea1SDimitry Andric Value *FrontV = Item.front().first->get(); 18850fca6ea1SDimitry Andric Use *U = IL.first; 18860fca6ea1SDimitry Andric return !U || U->get() == FrontV; 18870fca6ea1SDimitry Andric })) { 18880fca6ea1SDimitry Andric SplatLeafs.insert(FrontU); 18890fca6ea1SDimitry Andric continue; 18900fca6ea1SDimitry Andric } 18910fca6ea1SDimitry Andric // Look for a splat value. 
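    // (E.g. every lane resolves to lane 1 of one value, which a single splat
    // shuffle of that value can reproduce.)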
18920fca6ea1SDimitry Andric if (all_of(drop_begin(Item), [Item](InstLane &IL) { 18930fca6ea1SDimitry Andric auto [FrontU, FrontLane] = Item.front(); 18940fca6ea1SDimitry Andric auto [U, Lane] = IL; 18950fca6ea1SDimitry Andric return !U || (U->get() == FrontU->get() && Lane == FrontLane); 18960fca6ea1SDimitry Andric })) { 18970fca6ea1SDimitry Andric SplatLeafs.insert(FrontU); 18980fca6ea1SDimitry Andric continue; 18990fca6ea1SDimitry Andric } 19000fca6ea1SDimitry Andric 19010fca6ea1SDimitry Andric // We need each element to be the same type of value, and check that each 19020fca6ea1SDimitry Andric // element has a single use. 1903*5deeebd8SDimitry Andric auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) { 19040fca6ea1SDimitry Andric Value *FrontV = Item.front().first->get(); 19050fca6ea1SDimitry Andric if (!IL.first) 19060fca6ea1SDimitry Andric return true; 19070fca6ea1SDimitry Andric Value *V = IL.first->get(); 19080fca6ea1SDimitry Andric if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse()) 19090fca6ea1SDimitry Andric return false; 19100fca6ea1SDimitry Andric if (V->getValueID() != FrontV->getValueID()) 19110fca6ea1SDimitry Andric return false; 19120fca6ea1SDimitry Andric if (auto *CI = dyn_cast<CmpInst>(V)) 19130fca6ea1SDimitry Andric if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate()) 19140fca6ea1SDimitry Andric return false; 19150fca6ea1SDimitry Andric if (auto *CI = dyn_cast<CastInst>(V)) 19160fca6ea1SDimitry Andric if (CI->getSrcTy() != cast<CastInst>(FrontV)->getSrcTy()) 19170fca6ea1SDimitry Andric return false; 19180fca6ea1SDimitry Andric if (auto *SI = dyn_cast<SelectInst>(V)) 19190fca6ea1SDimitry Andric if (!isa<VectorType>(SI->getOperand(0)->getType()) || 19200fca6ea1SDimitry Andric SI->getOperand(0)->getType() != 19210fca6ea1SDimitry Andric cast<SelectInst>(FrontV)->getOperand(0)->getType()) 19220fca6ea1SDimitry Andric return false; 19230fca6ea1SDimitry Andric if (isa<CallInst>(V) && !isa<IntrinsicInst>(V)) 19240fca6ea1SDimitry Andric return false; 19250fca6ea1SDimitry Andric auto *II = dyn_cast<IntrinsicInst>(V); 19260fca6ea1SDimitry Andric return !II || (isa<IntrinsicInst>(FrontV) && 19270fca6ea1SDimitry Andric II->getIntrinsicID() == 1928*5deeebd8SDimitry Andric cast<IntrinsicInst>(FrontV)->getIntrinsicID() && 1929*5deeebd8SDimitry Andric !II->hasOperandBundles()); 1930*5deeebd8SDimitry Andric }; 1931*5deeebd8SDimitry Andric if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) { 19320fca6ea1SDimitry Andric // Check the operator is one that we support. 19330fca6ea1SDimitry Andric if (isa<BinaryOperator, CmpInst>(FrontU)) { 19340fca6ea1SDimitry Andric // We exclude div/rem in case they hit UB from poison lanes. 19350fca6ea1SDimitry Andric if (auto *BO = dyn_cast<BinaryOperator>(FrontU); 19360fca6ea1SDimitry Andric BO && BO->isIntDivRem()) 19370fca6ea1SDimitry Andric return false; 19380fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 19390fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); 19400fca6ea1SDimitry Andric continue; 19410fca6ea1SDimitry Andric } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontU)) { 19420fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 19430fca6ea1SDimitry Andric continue; 19440fca6ea1SDimitry Andric } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) { 19450fca6ea1SDimitry Andric // TODO: Handle vector widening/narrowing bitcasts. 
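        // For now, only element-count-preserving bitcasts are looked through,
        // e.g. <4 x i32> -> <4 x float>, but not <4 x i32> -> <8 x i16>.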
19460fca6ea1SDimitry Andric auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy()); 19470fca6ea1SDimitry Andric auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy()); 19480fca6ea1SDimitry Andric if (DstTy && SrcTy && 19490fca6ea1SDimitry Andric SrcTy->getNumElements() == DstTy->getNumElements()) { 19500fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 19510fca6ea1SDimitry Andric continue; 19520fca6ea1SDimitry Andric } 19530fca6ea1SDimitry Andric } else if (isa<SelectInst>(FrontU)) { 19540fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 19550fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); 19560fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2)); 19570fca6ea1SDimitry Andric continue; 19580fca6ea1SDimitry Andric } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU); 1959*5deeebd8SDimitry Andric II && isTriviallyVectorizable(II->getIntrinsicID()) && 1960*5deeebd8SDimitry Andric !II->hasOperandBundles()) { 19610fca6ea1SDimitry Andric for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { 19620fca6ea1SDimitry Andric if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) { 19630fca6ea1SDimitry Andric if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) { 19640fca6ea1SDimitry Andric Value *FrontV = Item.front().first->get(); 19650fca6ea1SDimitry Andric Use *U = IL.first; 19660fca6ea1SDimitry Andric return !U || (cast<Instruction>(U->get())->getOperand(Op) == 19670fca6ea1SDimitry Andric cast<Instruction>(FrontV)->getOperand(Op)); 19680fca6ea1SDimitry Andric })) 19690fca6ea1SDimitry Andric return false; 19700fca6ea1SDimitry Andric continue; 19710fca6ea1SDimitry Andric } 19720fca6ea1SDimitry Andric Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op)); 19730fca6ea1SDimitry Andric } 19740fca6ea1SDimitry Andric continue; 19750fca6ea1SDimitry Andric } 19760fca6ea1SDimitry Andric } 19770fca6ea1SDimitry Andric 19780fca6ea1SDimitry Andric if (isFreeConcat(Item, TTI)) { 19790fca6ea1SDimitry Andric ConcatLeafs.insert(FrontU); 19800fca6ea1SDimitry Andric continue; 19810fca6ea1SDimitry Andric } 19820fca6ea1SDimitry Andric 19830fca6ea1SDimitry Andric return false; 19840fca6ea1SDimitry Andric } 19850fca6ea1SDimitry Andric 19860fca6ea1SDimitry Andric if (NumVisited <= 1) 19870fca6ea1SDimitry Andric return false; 19880fca6ea1SDimitry Andric 19890fca6ea1SDimitry Andric // If we got this far, we know the shuffles are superfluous and can be 19900fca6ea1SDimitry Andric // removed. Scan through again and generate the new tree of instructions. 19910fca6ea1SDimitry Andric Builder.SetInsertPoint(&I); 19920fca6ea1SDimitry Andric Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, 19930fca6ea1SDimitry Andric ConcatLeafs, Builder); 19940fca6ea1SDimitry Andric replaceValue(I, *V); 19950fca6ea1SDimitry Andric return true; 19960fca6ea1SDimitry Andric } 19970fca6ea1SDimitry Andric 199881ad6265SDimitry Andric /// Given a commutative reduction, the order of the input lanes does not alter 199981ad6265SDimitry Andric /// the results. We can use this to remove certain shuffles feeding the 200081ad6265SDimitry Andric /// reduction, removing the need to shuffle at all. 
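///
/// An illustrative sketch (hypothetical IR):
///   %s = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// The add reduction sums every lane regardless of order, so this is
/// equivalent to reducing %a directly:
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)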
200181ad6265SDimitry Andric bool VectorCombine::foldShuffleFromReductions(Instruction &I) { 200281ad6265SDimitry Andric auto *II = dyn_cast<IntrinsicInst>(&I); 200381ad6265SDimitry Andric if (!II) 200481ad6265SDimitry Andric return false; 200581ad6265SDimitry Andric switch (II->getIntrinsicID()) { 200681ad6265SDimitry Andric case Intrinsic::vector_reduce_add: 200781ad6265SDimitry Andric case Intrinsic::vector_reduce_mul: 200881ad6265SDimitry Andric case Intrinsic::vector_reduce_and: 200981ad6265SDimitry Andric case Intrinsic::vector_reduce_or: 201081ad6265SDimitry Andric case Intrinsic::vector_reduce_xor: 201181ad6265SDimitry Andric case Intrinsic::vector_reduce_smin: 201281ad6265SDimitry Andric case Intrinsic::vector_reduce_smax: 201381ad6265SDimitry Andric case Intrinsic::vector_reduce_umin: 201481ad6265SDimitry Andric case Intrinsic::vector_reduce_umax: 201581ad6265SDimitry Andric break; 201681ad6265SDimitry Andric default: 201781ad6265SDimitry Andric return false; 201881ad6265SDimitry Andric } 201981ad6265SDimitry Andric 202081ad6265SDimitry Andric // Find all the inputs when looking through operations that do not alter the 202181ad6265SDimitry Andric // lane order (binops, for example). Currently we look for a single shuffle, 202281ad6265SDimitry Andric // and can ignore splat values. 202381ad6265SDimitry Andric std::queue<Value *> Worklist; 202481ad6265SDimitry Andric SmallPtrSet<Value *, 4> Visited; 202581ad6265SDimitry Andric ShuffleVectorInst *Shuffle = nullptr; 202681ad6265SDimitry Andric if (auto *Op = dyn_cast<Instruction>(I.getOperand(0))) 202781ad6265SDimitry Andric Worklist.push(Op); 202881ad6265SDimitry Andric 202981ad6265SDimitry Andric while (!Worklist.empty()) { 203081ad6265SDimitry Andric Value *CV = Worklist.front(); 203181ad6265SDimitry Andric Worklist.pop(); 203281ad6265SDimitry Andric if (Visited.contains(CV)) 203381ad6265SDimitry Andric continue; 203481ad6265SDimitry Andric 203581ad6265SDimitry Andric // Splats don't change the order, so can be safely ignored. 203681ad6265SDimitry Andric if (isSplatValue(CV)) 203781ad6265SDimitry Andric continue; 203881ad6265SDimitry Andric 203981ad6265SDimitry Andric Visited.insert(CV); 204081ad6265SDimitry Andric 204181ad6265SDimitry Andric if (auto *CI = dyn_cast<Instruction>(CV)) { 204281ad6265SDimitry Andric if (CI->isBinaryOp()) { 204381ad6265SDimitry Andric for (auto *Op : CI->operand_values()) 204481ad6265SDimitry Andric Worklist.push(Op); 204581ad6265SDimitry Andric continue; 204681ad6265SDimitry Andric } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) { 204781ad6265SDimitry Andric if (Shuffle && Shuffle != SV) 204881ad6265SDimitry Andric return false; 204981ad6265SDimitry Andric Shuffle = SV; 205081ad6265SDimitry Andric continue; 205181ad6265SDimitry Andric } 205281ad6265SDimitry Andric } 205381ad6265SDimitry Andric 205481ad6265SDimitry Andric // Anything else is currently an unknown node. 205581ad6265SDimitry Andric return false; 205681ad6265SDimitry Andric } 205781ad6265SDimitry Andric 205881ad6265SDimitry Andric if (!Shuffle) 205981ad6265SDimitry Andric return false; 206081ad6265SDimitry Andric 206181ad6265SDimitry Andric // Check all uses of the binary ops and shuffles are also included in the 206281ad6265SDimitry Andric // lane-invariant operations (Visited should be the list of lanewise 206381ad6265SDimitry Andric // instructions, including the shuffle that we found). 
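  // (If a visited instruction had an extra user outside this set, that user
  // would observe the lanes in their original order, so reordering them would
  // be unsafe.)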
206481ad6265SDimitry Andric   for (auto *V : Visited)
206581ad6265SDimitry Andric     for (auto *U : V->users())
206681ad6265SDimitry Andric       if (!Visited.contains(U) && U != &I)
206781ad6265SDimitry Andric         return false;
206881ad6265SDimitry Andric
206981ad6265SDimitry Andric   FixedVectorType *VecType =
207081ad6265SDimitry Andric       dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
207181ad6265SDimitry Andric   if (!VecType)
207281ad6265SDimitry Andric     return false;
207381ad6265SDimitry Andric   FixedVectorType *ShuffleInputType =
207481ad6265SDimitry Andric       dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
207581ad6265SDimitry Andric   if (!ShuffleInputType)
207681ad6265SDimitry Andric     return false;
20775f757f3fSDimitry Andric   unsigned NumInputElts = ShuffleInputType->getNumElements();
207881ad6265SDimitry Andric
207981ad6265SDimitry Andric   // Find the mask from sorting the lanes into order. This is most likely to
208081ad6265SDimitry Andric   // become an identity or concat mask. Undef elements are pushed to the end.
208181ad6265SDimitry Andric   SmallVector<int> ConcatMask;
208281ad6265SDimitry Andric   Shuffle->getShuffleMask(ConcatMask);
208381ad6265SDimitry Andric   sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
20845f757f3fSDimitry Andric   // In the case of a truncating shuffle it's possible for the mask
20855f757f3fSDimitry Andric   // to have an index greater than the size of the resulting vector.
20865f757f3fSDimitry Andric   // This requires special handling.
20875f757f3fSDimitry Andric   bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
208881ad6265SDimitry Andric   bool UsesSecondVec =
20895f757f3fSDimitry Andric       any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
20905f757f3fSDimitry Andric
20915f757f3fSDimitry Andric   FixedVectorType *VecTyForCost =
20925f757f3fSDimitry Andric       (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
209381ad6265SDimitry Andric   InstructionCost OldCost = TTI.getShuffleCost(
20945f757f3fSDimitry Andric       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
20955f757f3fSDimitry Andric       VecTyForCost, Shuffle->getShuffleMask());
209681ad6265SDimitry Andric   InstructionCost NewCost = TTI.getShuffleCost(
20975f757f3fSDimitry Andric       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
20985f757f3fSDimitry Andric       VecTyForCost, ConcatMask);
209981ad6265SDimitry Andric
210081ad6265SDimitry Andric   LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
210181ad6265SDimitry Andric                     << "\n");
210281ad6265SDimitry Andric   LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost
210381ad6265SDimitry Andric                     << "\n");
210481ad6265SDimitry Andric   if (NewCost < OldCost) {
210581ad6265SDimitry Andric     Builder.SetInsertPoint(Shuffle);
210681ad6265SDimitry Andric     Value *NewShuffle = Builder.CreateShuffleVector(
210781ad6265SDimitry Andric         Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
210881ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
210981ad6265SDimitry Andric     replaceValue(*Shuffle, *NewShuffle);
211081ad6265SDimitry Andric   }
211181ad6265SDimitry Andric
211281ad6265SDimitry Andric   // See if we can re-use foldSelectShuffle, getting it to reduce the size of
211381ad6265SDimitry Andric   // the shuffle into a nicer order, as it can ignore the order of the shuffles.
211481ad6265SDimitry Andric   return foldSelectShuffle(*Shuffle, true);
211581ad6265SDimitry Andric }
211681ad6265SDimitry Andric
21170fca6ea1SDimitry Andric /// Determine if it's more efficient to fold:
21180fca6ea1SDimitry Andric ///   reduce(trunc(x)) -> trunc(reduce(x)).
21190fca6ea1SDimitry Andric ///   reduce(sext(x)) -> sext(reduce(x)).
21200fca6ea1SDimitry Andric ///   reduce(zext(x)) -> zext(reduce(x)).
21210fca6ea1SDimitry Andric bool VectorCombine::foldCastFromReductions(Instruction &I) {
21220fca6ea1SDimitry Andric   auto *II = dyn_cast<IntrinsicInst>(&I);
21230fca6ea1SDimitry Andric   if (!II)
21240fca6ea1SDimitry Andric     return false;
21250fca6ea1SDimitry Andric
21260fca6ea1SDimitry Andric   bool TruncOnly = false;
21270fca6ea1SDimitry Andric   Intrinsic::ID IID = II->getIntrinsicID();
21280fca6ea1SDimitry Andric   switch (IID) {
21290fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_add:
21300fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_mul:
21310fca6ea1SDimitry Andric     TruncOnly = true;
21320fca6ea1SDimitry Andric     break;
21330fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_and:
21340fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_or:
21350fca6ea1SDimitry Andric   case Intrinsic::vector_reduce_xor:
21360fca6ea1SDimitry Andric     break;
21370fca6ea1SDimitry Andric   default:
21380fca6ea1SDimitry Andric     return false;
21390fca6ea1SDimitry Andric   }
21400fca6ea1SDimitry Andric
21410fca6ea1SDimitry Andric   unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
21420fca6ea1SDimitry Andric   Value *ReductionSrc = I.getOperand(0);
21430fca6ea1SDimitry Andric
21440fca6ea1SDimitry Andric   Value *Src;
21450fca6ea1SDimitry Andric   if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
21460fca6ea1SDimitry Andric       (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
21470fca6ea1SDimitry Andric     return false;
21480fca6ea1SDimitry Andric
21490fca6ea1SDimitry Andric   auto CastOpc =
21500fca6ea1SDimitry Andric       (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
21510fca6ea1SDimitry Andric
21520fca6ea1SDimitry Andric   auto *SrcTy = cast<VectorType>(Src->getType());
21530fca6ea1SDimitry Andric   auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
21540fca6ea1SDimitry Andric   Type *ResultTy = I.getType();
21550fca6ea1SDimitry Andric
21560fca6ea1SDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
21570fca6ea1SDimitry Andric   InstructionCost OldCost = TTI.getArithmeticReductionCost(
21580fca6ea1SDimitry Andric       ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
21590fca6ea1SDimitry Andric   OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
21600fca6ea1SDimitry Andric                                   TTI::CastContextHint::None, CostKind,
21610fca6ea1SDimitry Andric                                   cast<CastInst>(ReductionSrc));
21620fca6ea1SDimitry Andric   InstructionCost NewCost =
21630fca6ea1SDimitry Andric       TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
21640fca6ea1SDimitry Andric                                      CostKind) +
21650fca6ea1SDimitry Andric       TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
21660fca6ea1SDimitry Andric                            TTI::CastContextHint::None, CostKind);
21670fca6ea1SDimitry Andric
21680fca6ea1SDimitry Andric   if (OldCost <= NewCost || !NewCost.isValid())
21690fca6ea1SDimitry Andric     return false;
21700fca6ea1SDimitry Andric
21710fca6ea1SDimitry Andric   Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
21720fca6ea1SDimitry Andric                                                 II->getIntrinsicID(), {Src});
21730fca6ea1SDimitry Andric   Value *NewCast = Builder.CreateCast(CastOpc,
NewReduction, ResultTy); 21740fca6ea1SDimitry Andric replaceValue(I, *NewCast); 21750fca6ea1SDimitry Andric return true; 21760fca6ea1SDimitry Andric } 21770fca6ea1SDimitry Andric 217881ad6265SDimitry Andric /// This method looks for groups of shuffles acting on binops, of the form: 217981ad6265SDimitry Andric /// %x = shuffle ... 218081ad6265SDimitry Andric /// %y = shuffle ... 218181ad6265SDimitry Andric /// %a = binop %x, %y 218281ad6265SDimitry Andric /// %b = binop %x, %y 218381ad6265SDimitry Andric /// shuffle %a, %b, selectmask 218481ad6265SDimitry Andric /// We may, especially if the shuffle is wider than legal, be able to convert 218581ad6265SDimitry Andric /// the shuffle to a form where only parts of a and b need to be computed. On 218681ad6265SDimitry Andric /// architectures with no obvious "select" shuffle, this can reduce the total 218781ad6265SDimitry Andric /// number of operations if the target reports them as cheaper. 218881ad6265SDimitry Andric bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { 2189bdd1243dSDimitry Andric auto *SVI = cast<ShuffleVectorInst>(&I); 2190bdd1243dSDimitry Andric auto *VT = cast<FixedVectorType>(I.getType()); 219181ad6265SDimitry Andric auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0)); 219281ad6265SDimitry Andric auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1)); 219381ad6265SDimitry Andric if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || 219481ad6265SDimitry Andric VT != Op0->getType()) 219581ad6265SDimitry Andric return false; 2196bdd1243dSDimitry Andric 2197753f127fSDimitry Andric auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); 2198753f127fSDimitry Andric auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); 2199753f127fSDimitry Andric auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); 2200753f127fSDimitry Andric auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); 2201753f127fSDimitry Andric SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); 220281ad6265SDimitry Andric auto checkSVNonOpUses = [&](Instruction *I) { 220381ad6265SDimitry Andric if (!I || I->getOperand(0)->getType() != VT) 220481ad6265SDimitry Andric return true; 2205753f127fSDimitry Andric return any_of(I->users(), [&](User *U) { 2206753f127fSDimitry Andric return U != Op0 && U != Op1 && 2207753f127fSDimitry Andric !(isa<ShuffleVectorInst>(U) && 2208753f127fSDimitry Andric (InputShuffles.contains(cast<Instruction>(U)) || 2209753f127fSDimitry Andric isInstructionTriviallyDead(cast<Instruction>(U)))); 2210753f127fSDimitry Andric }); 221181ad6265SDimitry Andric }; 221281ad6265SDimitry Andric if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || 221381ad6265SDimitry Andric checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) 221481ad6265SDimitry Andric return false; 221581ad6265SDimitry Andric 221681ad6265SDimitry Andric // Collect all the uses that are shuffles that we can transform together. We 221781ad6265SDimitry Andric // may not have a single shuffle, but a group that can all be transformed 221881ad6265SDimitry Andric // together profitably. 
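  // (E.g. multiple "select" shuffles of %a and %b, such as one with mask
  // <0, 5, 2, 7>, each picking some lanes from each binop.)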
221981ad6265SDimitry Andric SmallVector<ShuffleVectorInst *> Shuffles; 222081ad6265SDimitry Andric auto collectShuffles = [&](Instruction *I) { 222181ad6265SDimitry Andric for (auto *U : I->users()) { 222281ad6265SDimitry Andric auto *SV = dyn_cast<ShuffleVectorInst>(U); 222381ad6265SDimitry Andric if (!SV || SV->getType() != VT) 222481ad6265SDimitry Andric return false; 2225753f127fSDimitry Andric if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || 2226753f127fSDimitry Andric (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) 2227753f127fSDimitry Andric return false; 222881ad6265SDimitry Andric if (!llvm::is_contained(Shuffles, SV)) 222981ad6265SDimitry Andric Shuffles.push_back(SV); 223081ad6265SDimitry Andric } 223181ad6265SDimitry Andric return true; 223281ad6265SDimitry Andric }; 223381ad6265SDimitry Andric if (!collectShuffles(Op0) || !collectShuffles(Op1)) 223481ad6265SDimitry Andric return false; 223581ad6265SDimitry Andric // From a reduction, we need to be processing a single shuffle, otherwise the 223681ad6265SDimitry Andric // other uses will not be lane-invariant. 223781ad6265SDimitry Andric if (FromReduction && Shuffles.size() > 1) 223881ad6265SDimitry Andric return false; 223981ad6265SDimitry Andric 2240753f127fSDimitry Andric // Add any shuffle uses for the shuffles we have found, to include them in our 2241753f127fSDimitry Andric // cost calculations. 2242753f127fSDimitry Andric if (!FromReduction) { 2243753f127fSDimitry Andric for (ShuffleVectorInst *SV : Shuffles) { 2244bdd1243dSDimitry Andric for (auto *U : SV->users()) { 2245753f127fSDimitry Andric ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); 2246fcaf7f86SDimitry Andric if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT) 2247753f127fSDimitry Andric Shuffles.push_back(SSV); 2248753f127fSDimitry Andric } 2249753f127fSDimitry Andric } 2250753f127fSDimitry Andric } 2251753f127fSDimitry Andric 225281ad6265SDimitry Andric // For each of the output shuffles, we try to sort all the first vector 225381ad6265SDimitry Andric // elements to the beginning, followed by the second array elements at the 225481ad6265SDimitry Andric // end. If the binops are legalized to smaller vectors, this may reduce total 225581ad6265SDimitry Andric // number of binops. We compute the ReconstructMask mask needed to convert 225681ad6265SDimitry Andric // back to the original lane order. 2257753f127fSDimitry Andric SmallVector<std::pair<int, int>> V1, V2; 2258753f127fSDimitry Andric SmallVector<SmallVector<int>> OrigReconstructMasks; 225981ad6265SDimitry Andric int MaxV1Elt = 0, MaxV2Elt = 0; 226081ad6265SDimitry Andric unsigned NumElts = VT->getNumElements(); 226181ad6265SDimitry Andric for (ShuffleVectorInst *SVN : Shuffles) { 226281ad6265SDimitry Andric SmallVector<int> Mask; 226381ad6265SDimitry Andric SVN->getShuffleMask(Mask); 226481ad6265SDimitry Andric 226581ad6265SDimitry Andric // Check the operands are the same as the original, or reversed (in which 226681ad6265SDimitry Andric // case we need to commute the mask). 
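    // (Commuting swaps which input each mask index selects: with NumElts = 4,
    // a mask of <0, 5, 2, 7> on (Op1, Op0) becomes <4, 1, 6, 3> on
    // (Op0, Op1).)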
226781ad6265SDimitry Andric     Value *SVOp0 = SVN->getOperand(0);
226881ad6265SDimitry Andric     Value *SVOp1 = SVN->getOperand(1);
2269753f127fSDimitry Andric     if (isa<UndefValue>(SVOp1)) {
2270753f127fSDimitry Andric       auto *SSV = cast<ShuffleVectorInst>(SVOp0);
2271753f127fSDimitry Andric       SVOp0 = SSV->getOperand(0);
2272753f127fSDimitry Andric       SVOp1 = SSV->getOperand(1);
2273753f127fSDimitry Andric       for (unsigned I = 0, E = Mask.size(); I != E; I++) {
2274753f127fSDimitry Andric         if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size()))
2275753f127fSDimitry Andric           return false;
2276753f127fSDimitry Andric         Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
2277753f127fSDimitry Andric       }
2278753f127fSDimitry Andric     }
227981ad6265SDimitry Andric     if (SVOp0 == Op1 && SVOp1 == Op0) {
228081ad6265SDimitry Andric       std::swap(SVOp0, SVOp1);
228181ad6265SDimitry Andric       ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
228281ad6265SDimitry Andric     }
228381ad6265SDimitry Andric     if (SVOp0 != Op0 || SVOp1 != Op1)
228481ad6265SDimitry Andric       return false;
228581ad6265SDimitry Andric
228681ad6265SDimitry Andric     // Calculate the reconstruction mask for this shuffle, as the mask needed
228781ad6265SDimitry Andric     // to take the packed values from Op0/Op1 and reconstruct the original
228881ad6265SDimitry Andric     // lane order.
228981ad6265SDimitry Andric     SmallVector<int> ReconstructMask;
229081ad6265SDimitry Andric     for (unsigned I = 0; I < Mask.size(); I++) {
229181ad6265SDimitry Andric       if (Mask[I] < 0) {
229281ad6265SDimitry Andric         ReconstructMask.push_back(-1);
229381ad6265SDimitry Andric       } else if (Mask[I] < static_cast<int>(NumElts)) {
229481ad6265SDimitry Andric         MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
2295753f127fSDimitry Andric         auto It = find_if(V1, [&](const std::pair<int, int> &A) {
2296753f127fSDimitry Andric           return Mask[I] == A.first;
2297753f127fSDimitry Andric         });
229881ad6265SDimitry Andric         if (It != V1.end())
229981ad6265SDimitry Andric           ReconstructMask.push_back(It - V1.begin());
230081ad6265SDimitry Andric         else {
230181ad6265SDimitry Andric           ReconstructMask.push_back(V1.size());
2302753f127fSDimitry Andric           V1.emplace_back(Mask[I], V1.size());
230381ad6265SDimitry Andric         }
230481ad6265SDimitry Andric       } else {
230581ad6265SDimitry Andric         MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
2306753f127fSDimitry Andric         auto It = find_if(V2, [&](const std::pair<int, int> &A) {
2307753f127fSDimitry Andric           return Mask[I] - static_cast<int>(NumElts) == A.first;
2308753f127fSDimitry Andric         });
230981ad6265SDimitry Andric         if (It != V2.end())
231081ad6265SDimitry Andric           ReconstructMask.push_back(NumElts + It - V2.begin());
231181ad6265SDimitry Andric         else {
231281ad6265SDimitry Andric           ReconstructMask.push_back(NumElts + V2.size());
2313753f127fSDimitry Andric           V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
231481ad6265SDimitry Andric         }
231581ad6265SDimitry Andric       }
231681ad6265SDimitry Andric     }
231781ad6265SDimitry Andric
231881ad6265SDimitry Andric     // For reductions, we know that the output lane ordering doesn't alter the
231981ad6265SDimitry Andric     // result. In-order can help simplify the shuffle away.
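    // (E.g. a ReconstructMask of <1, 3, 0, 2> sorts to the identity mask
    // <0, 1, 2, 3>, which can then fold away entirely.)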
232081ad6265SDimitry Andric     if (FromReduction)
232181ad6265SDimitry Andric       sort(ReconstructMask);
2322753f127fSDimitry Andric     OrigReconstructMasks.push_back(std::move(ReconstructMask));
232381ad6265SDimitry Andric   }
232481ad6265SDimitry Andric
232581ad6265SDimitry Andric   // If the maximum elements used from V1 and V2 are not larger than the new
232681ad6265SDimitry Andric   // vectors, the vectors are already packed and performing the optimization
232781ad6265SDimitry Andric   // again will likely not help any further. This also prevents us from getting
232881ad6265SDimitry Andric   // stuck in a cycle in case the costs do not also rule it out.
232981ad6265SDimitry Andric   if (V1.empty() || V2.empty() ||
233081ad6265SDimitry Andric       (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
233181ad6265SDimitry Andric        MaxV2Elt == static_cast<int>(V2.size()) - 1))
233281ad6265SDimitry Andric     return false;
233381ad6265SDimitry Andric
2334753f127fSDimitry Andric   // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
2335753f127fSDimitry Andric   // shuffle of another shuffle, or not a shuffle (that is treated like an
2336753f127fSDimitry Andric   // identity shuffle).
2337753f127fSDimitry Andric   auto GetBaseMaskValue = [&](Instruction *I, int M) {
2338753f127fSDimitry Andric     auto *SV = dyn_cast<ShuffleVectorInst>(I);
2339753f127fSDimitry Andric     if (!SV)
2340753f127fSDimitry Andric       return M;
2341753f127fSDimitry Andric     if (isa<UndefValue>(SV->getOperand(1)))
2342753f127fSDimitry Andric       if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
2343753f127fSDimitry Andric         if (InputShuffles.contains(SSV))
2344753f127fSDimitry Andric           return SSV->getMaskValue(SV->getMaskValue(M));
2345753f127fSDimitry Andric     return SV->getMaskValue(M);
2346753f127fSDimitry Andric   };
2347753f127fSDimitry Andric
2348753f127fSDimitry Andric   // Attempt to sort the inputs by ascending mask values to make simpler input
2349753f127fSDimitry Andric   // shuffles and push complex shuffles down to the uses. We sort on the first
2350753f127fSDimitry Andric   // of the two input shuffle orders, to try and get at least one input into a
2351753f127fSDimitry Andric   // nice order.
2352753f127fSDimitry Andric   auto SortBase = [&](Instruction *A, std::pair<int, int> X,
2353753f127fSDimitry Andric                       std::pair<int, int> Y) {
2354753f127fSDimitry Andric     int MXA = GetBaseMaskValue(A, X.first);
2355753f127fSDimitry Andric     int MYA = GetBaseMaskValue(A, Y.first);
2356753f127fSDimitry Andric     return MXA < MYA;
2357753f127fSDimitry Andric   };
2358753f127fSDimitry Andric   stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
2359753f127fSDimitry Andric     return SortBase(SVI0A, A, B);
2360753f127fSDimitry Andric   });
2361753f127fSDimitry Andric   stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
2362753f127fSDimitry Andric     return SortBase(SVI1A, A, B);
2363753f127fSDimitry Andric   });
2364753f127fSDimitry Andric   // Calculate our ReconstructMasks from the OrigReconstructMasks and the
2365753f127fSDimitry Andric   // modified order of the input shuffles.
2366753f127fSDimitry Andric   SmallVector<SmallVector<int>> ReconstructMasks;
236706c3fb27SDimitry Andric   for (const auto &Mask : OrigReconstructMasks) {
2368753f127fSDimitry Andric     SmallVector<int> ReconstructMask;
2369753f127fSDimitry Andric     for (int M : Mask) {
2370753f127fSDimitry Andric       auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
2371753f127fSDimitry Andric         auto It = find_if(V, [M](auto A) { return A.second == M; });
2372753f127fSDimitry Andric         assert(It != V.end() && "Expected all entries in Mask");
2373753f127fSDimitry Andric         return std::distance(V.begin(), It);
2374753f127fSDimitry Andric       };
2375753f127fSDimitry Andric       if (M < 0)
2376753f127fSDimitry Andric         ReconstructMask.push_back(-1);
2377753f127fSDimitry Andric       else if (M < static_cast<int>(NumElts)) {
2378753f127fSDimitry Andric         ReconstructMask.push_back(FindIndex(V1, M));
2379753f127fSDimitry Andric       } else {
2380753f127fSDimitry Andric         ReconstructMask.push_back(NumElts + FindIndex(V2, M));
2381753f127fSDimitry Andric       }
2382753f127fSDimitry Andric     }
2383753f127fSDimitry Andric     ReconstructMasks.push_back(std::move(ReconstructMask));
2384753f127fSDimitry Andric   }
2385753f127fSDimitry Andric
238681ad6265SDimitry Andric   // Calculate the masks needed for the new input shuffles, which get padded
238781ad6265SDimitry Andric   // with poison.
238881ad6265SDimitry Andric   SmallVector<int> V1A, V1B, V2A, V2B;
238981ad6265SDimitry Andric   for (unsigned I = 0; I < V1.size(); I++) {
2390753f127fSDimitry Andric     V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
2391753f127fSDimitry Andric     V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
239281ad6265SDimitry Andric   }
239381ad6265SDimitry Andric   for (unsigned I = 0; I < V2.size(); I++) {
2394753f127fSDimitry Andric     V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
2395753f127fSDimitry Andric     V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
239681ad6265SDimitry Andric   }
239781ad6265SDimitry Andric   while (V1A.size() < NumElts) {
239806c3fb27SDimitry Andric     V1A.push_back(PoisonMaskElem);
239906c3fb27SDimitry Andric     V1B.push_back(PoisonMaskElem);
240081ad6265SDimitry Andric   }
240181ad6265SDimitry Andric   while (V2A.size() < NumElts) {
240206c3fb27SDimitry Andric     V2A.push_back(PoisonMaskElem);
240306c3fb27SDimitry Andric     V2B.push_back(PoisonMaskElem);
240481ad6265SDimitry Andric   }
240581ad6265SDimitry Andric
2406753f127fSDimitry Andric   auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
2407753f127fSDimitry Andric     auto *SV = dyn_cast<ShuffleVectorInst>(I);
2408753f127fSDimitry Andric     if (!SV)
2409753f127fSDimitry Andric       return C;
2410753f127fSDimitry Andric     return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
2411753f127fSDimitry Andric                                       ? TTI::SK_PermuteSingleSrc
2412753f127fSDimitry Andric                                       : TTI::SK_PermuteTwoSrc,
2413753f127fSDimitry Andric                                   VT, SV->getShuffleMask());
241481ad6265SDimitry Andric   };
241581ad6265SDimitry Andric   auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
241681ad6265SDimitry Andric     return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
241781ad6265SDimitry Andric   };
241881ad6265SDimitry Andric
241981ad6265SDimitry Andric   // Get the costs of the shuffles + binops before and after with the new
242081ad6265SDimitry Andric   // shuffle masks.
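  // (CostBefore counts the two original binops plus all existing input and
  // output shuffles; CostAfter counts the binops costed at the narrower packed
  // types plus the new input masks (deduplicated) and the reconstruction
  // shuffles.)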
242181ad6265SDimitry Andric InstructionCost CostBefore = 242281ad6265SDimitry Andric TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) + 242381ad6265SDimitry Andric TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); 242481ad6265SDimitry Andric CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), 242581ad6265SDimitry Andric InstructionCost(0), AddShuffleCost); 242681ad6265SDimitry Andric CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), 242781ad6265SDimitry Andric InstructionCost(0), AddShuffleCost); 242881ad6265SDimitry Andric 242981ad6265SDimitry Andric // The new binops will be unused for lanes past the used shuffle lengths. 243081ad6265SDimitry Andric // These types attempt to get the correct cost for that from the target. 243181ad6265SDimitry Andric FixedVectorType *Op0SmallVT = 243281ad6265SDimitry Andric FixedVectorType::get(VT->getScalarType(), V1.size()); 243381ad6265SDimitry Andric FixedVectorType *Op1SmallVT = 243481ad6265SDimitry Andric FixedVectorType::get(VT->getScalarType(), V2.size()); 243581ad6265SDimitry Andric InstructionCost CostAfter = 243681ad6265SDimitry Andric TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) + 243781ad6265SDimitry Andric TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT); 243881ad6265SDimitry Andric CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), 243981ad6265SDimitry Andric InstructionCost(0), AddShuffleMaskCost); 244081ad6265SDimitry Andric std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); 244181ad6265SDimitry Andric CostAfter += 244281ad6265SDimitry Andric std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), 244381ad6265SDimitry Andric InstructionCost(0), AddShuffleMaskCost); 244481ad6265SDimitry Andric 2445753f127fSDimitry Andric LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); 2446753f127fSDimitry Andric LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore 2447753f127fSDimitry Andric << " vs CostAfter: " << CostAfter << "\n"); 244881ad6265SDimitry Andric if (CostBefore <= CostAfter) 244981ad6265SDimitry Andric return false; 245081ad6265SDimitry Andric 245181ad6265SDimitry Andric // The cost model has passed, create the new instructions. 
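  // (The new input shuffles are emitted where their operands are defined, the
  // binops at the original binop positions, and each original output shuffle
  // is replaced by a shuffle applying its ReconstructMask.)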
2452753f127fSDimitry Andric auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { 2453753f127fSDimitry Andric auto *SV = dyn_cast<ShuffleVectorInst>(I); 2454753f127fSDimitry Andric if (!SV) 2455753f127fSDimitry Andric return I; 2456753f127fSDimitry Andric if (isa<UndefValue>(SV->getOperand(1))) 2457753f127fSDimitry Andric if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) 2458753f127fSDimitry Andric if (InputShuffles.contains(SSV)) 2459753f127fSDimitry Andric return SSV->getOperand(Op); 2460753f127fSDimitry Andric return SV->getOperand(Op); 2461753f127fSDimitry Andric }; 24625f757f3fSDimitry Andric Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef()); 2463753f127fSDimitry Andric Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), 2464753f127fSDimitry Andric GetShuffleOperand(SVI0A, 1), V1A); 24655f757f3fSDimitry Andric Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef()); 2466753f127fSDimitry Andric Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), 2467753f127fSDimitry Andric GetShuffleOperand(SVI0B, 1), V1B); 24685f757f3fSDimitry Andric Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef()); 2469753f127fSDimitry Andric Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), 2470753f127fSDimitry Andric GetShuffleOperand(SVI1A, 1), V2A); 24715f757f3fSDimitry Andric Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef()); 2472753f127fSDimitry Andric Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), 2473753f127fSDimitry Andric GetShuffleOperand(SVI1B, 1), V2B); 247481ad6265SDimitry Andric Builder.SetInsertPoint(Op0); 247581ad6265SDimitry Andric Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), 247681ad6265SDimitry Andric NSV0A, NSV0B); 247781ad6265SDimitry Andric if (auto *I = dyn_cast<Instruction>(NOp0)) 247881ad6265SDimitry Andric I->copyIRFlags(Op0, true); 247981ad6265SDimitry Andric Builder.SetInsertPoint(Op1); 248081ad6265SDimitry Andric Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(), 248181ad6265SDimitry Andric NSV1A, NSV1B); 248281ad6265SDimitry Andric if (auto *I = dyn_cast<Instruction>(NOp1)) 248381ad6265SDimitry Andric I->copyIRFlags(Op1, true); 248481ad6265SDimitry Andric 248581ad6265SDimitry Andric for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { 248681ad6265SDimitry Andric Builder.SetInsertPoint(Shuffles[S]); 248781ad6265SDimitry Andric Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); 248881ad6265SDimitry Andric replaceValue(*Shuffles[S], *NSV); 248981ad6265SDimitry Andric } 249081ad6265SDimitry Andric 249181ad6265SDimitry Andric Worklist.pushValue(NSV0A); 249281ad6265SDimitry Andric Worklist.pushValue(NSV0B); 249381ad6265SDimitry Andric Worklist.pushValue(NSV1A); 249481ad6265SDimitry Andric Worklist.pushValue(NSV1B); 249581ad6265SDimitry Andric for (auto *S : Shuffles) 249681ad6265SDimitry Andric Worklist.add(S); 249781ad6265SDimitry Andric return true; 249881ad6265SDimitry Andric } 249981ad6265SDimitry Andric 25005ffd83dbSDimitry Andric /// This is the entry point for all transforms. Pass manager differences are 25015ffd83dbSDimitry Andric /// handled in the callers of this function. 25025ffd83dbSDimitry Andric bool VectorCombine::run() { 25035ffd83dbSDimitry Andric if (DisableVectorCombine) 25045ffd83dbSDimitry Andric return false; 25055ffd83dbSDimitry Andric 2506e8d8bef9SDimitry Andric // Don't attempt vectorization if the target does not support vectors. 
2507e8d8bef9SDimitry Andric   if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
2508e8d8bef9SDimitry Andric     return false;
2509e8d8bef9SDimitry Andric
25105ffd83dbSDimitry Andric   bool MadeChange = false;
2511349cc55cSDimitry Andric   auto FoldInst = [this, &MadeChange](Instruction &I) {
2512349cc55cSDimitry Andric     Builder.SetInsertPoint(&I);
2513bdd1243dSDimitry Andric     bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
2514bdd1243dSDimitry Andric     auto Opcode = I.getOpcode();
2515bdd1243dSDimitry Andric
2516bdd1243dSDimitry Andric     // These folds should be beneficial regardless of when this pass is run
2517bdd1243dSDimitry Andric     // in the optimization pipeline.
2518bdd1243dSDimitry Andric     // The type checking is for run-time efficiency. We can avoid wasting time
2519bdd1243dSDimitry Andric     // dispatching to folding functions if there's no chance of matching.
2520bdd1243dSDimitry Andric     if (IsFixedVectorType) {
2521bdd1243dSDimitry Andric       switch (Opcode) {
2522bdd1243dSDimitry Andric       case Instruction::InsertElement:
2523349cc55cSDimitry Andric         MadeChange |= vectorizeLoadInsert(I);
2524bdd1243dSDimitry Andric         break;
2525bdd1243dSDimitry Andric       case Instruction::ShuffleVector:
2526bdd1243dSDimitry Andric         MadeChange |= widenSubvectorLoad(I);
2527bdd1243dSDimitry Andric         break;
2528bdd1243dSDimitry Andric       default:
2529bdd1243dSDimitry Andric         break;
2530bdd1243dSDimitry Andric       }
2531bdd1243dSDimitry Andric     }
2532bdd1243dSDimitry Andric
2533bdd1243dSDimitry Andric     // This transform works with scalable and fixed vectors.
2534bdd1243dSDimitry Andric     // TODO: Identify and allow other scalable transforms.
25355f757f3fSDimitry Andric     if (isa<VectorType>(I.getType())) {
2536bdd1243dSDimitry Andric       MadeChange |= scalarizeBinopOrCmp(I);
25375f757f3fSDimitry Andric       MadeChange |= scalarizeLoadExtract(I);
25385f757f3fSDimitry Andric       MadeChange |= scalarizeVPIntrinsic(I);
25395f757f3fSDimitry Andric     }
2540bdd1243dSDimitry Andric
2541bdd1243dSDimitry Andric     if (Opcode == Instruction::Store)
2542349cc55cSDimitry Andric       MadeChange |= foldSingleElementStore(I);
2543bdd1243dSDimitry Andric
2544bdd1243dSDimitry Andric     // If this is an early pipeline invocation of this pass, we are done.
2545bdd1243dSDimitry Andric     if (TryEarlyFoldsOnly)
2546bdd1243dSDimitry Andric       return;
2547bdd1243dSDimitry Andric
2548bdd1243dSDimitry Andric     // Otherwise, try folds that improve codegen but may interfere with
2549bdd1243dSDimitry Andric     // early IR canonicalizations.
2550bdd1243dSDimitry Andric     // The type checking is for run-time efficiency. We can avoid wasting time
2551bdd1243dSDimitry Andric     // dispatching to folding functions if there's no chance of matching.
2552bdd1243dSDimitry Andric if (IsFixedVectorType) { 2553bdd1243dSDimitry Andric switch (Opcode) { 2554bdd1243dSDimitry Andric case Instruction::InsertElement: 2555bdd1243dSDimitry Andric MadeChange |= foldInsExtFNeg(I); 2556bdd1243dSDimitry Andric break; 2557bdd1243dSDimitry Andric case Instruction::ShuffleVector: 2558bdd1243dSDimitry Andric MadeChange |= foldShuffleOfBinops(I); 25590fca6ea1SDimitry Andric MadeChange |= foldShuffleOfCastops(I); 25600fca6ea1SDimitry Andric MadeChange |= foldShuffleOfShuffles(I); 2561bdd1243dSDimitry Andric MadeChange |= foldSelectShuffle(I); 25620fca6ea1SDimitry Andric MadeChange |= foldShuffleToIdentity(I); 2563bdd1243dSDimitry Andric break; 2564bdd1243dSDimitry Andric case Instruction::BitCast: 25655f757f3fSDimitry Andric MadeChange |= foldBitcastShuffle(I); 2566bdd1243dSDimitry Andric break; 2567bdd1243dSDimitry Andric } 2568bdd1243dSDimitry Andric } else { 2569bdd1243dSDimitry Andric switch (Opcode) { 2570bdd1243dSDimitry Andric case Instruction::Call: 2571bdd1243dSDimitry Andric MadeChange |= foldShuffleFromReductions(I); 25720fca6ea1SDimitry Andric MadeChange |= foldCastFromReductions(I); 2573bdd1243dSDimitry Andric break; 2574bdd1243dSDimitry Andric case Instruction::ICmp: 2575bdd1243dSDimitry Andric case Instruction::FCmp: 2576bdd1243dSDimitry Andric MadeChange |= foldExtractExtract(I); 2577bdd1243dSDimitry Andric break; 2578bdd1243dSDimitry Andric default: 2579bdd1243dSDimitry Andric if (Instruction::isBinaryOp(Opcode)) { 2580bdd1243dSDimitry Andric MadeChange |= foldExtractExtract(I); 2581bdd1243dSDimitry Andric MadeChange |= foldExtractedCmps(I); 2582bdd1243dSDimitry Andric } 2583bdd1243dSDimitry Andric break; 2584bdd1243dSDimitry Andric } 2585bdd1243dSDimitry Andric } 2586349cc55cSDimitry Andric }; 2587bdd1243dSDimitry Andric 25885ffd83dbSDimitry Andric for (BasicBlock &BB : F) { 25895ffd83dbSDimitry Andric // Ignore unreachable basic blocks. 25905ffd83dbSDimitry Andric if (!DT.isReachableFromEntry(&BB)) 25915ffd83dbSDimitry Andric continue; 2592fe6060f1SDimitry Andric // Use early increment range so that we can erase instructions in loop. 
2593fe6060f1SDimitry Andric for (Instruction &I : make_early_inc_range(BB)) { 2594349cc55cSDimitry Andric if (I.isDebugOrPseudoInst()) 25955ffd83dbSDimitry Andric continue; 2596349cc55cSDimitry Andric FoldInst(I); 25975ffd83dbSDimitry Andric } 25985ffd83dbSDimitry Andric } 25995ffd83dbSDimitry Andric 2600349cc55cSDimitry Andric while (!Worklist.isEmpty()) { 2601349cc55cSDimitry Andric Instruction *I = Worklist.removeOne(); 2602349cc55cSDimitry Andric if (!I) 2603349cc55cSDimitry Andric continue; 2604349cc55cSDimitry Andric 2605349cc55cSDimitry Andric if (isInstructionTriviallyDead(I)) { 2606349cc55cSDimitry Andric eraseInstruction(*I); 2607349cc55cSDimitry Andric continue; 2608349cc55cSDimitry Andric } 2609349cc55cSDimitry Andric 2610349cc55cSDimitry Andric FoldInst(*I); 2611349cc55cSDimitry Andric } 26125ffd83dbSDimitry Andric 26135ffd83dbSDimitry Andric return MadeChange; 26145ffd83dbSDimitry Andric } 26155ffd83dbSDimitry Andric 26165ffd83dbSDimitry Andric PreservedAnalyses VectorCombinePass::run(Function &F, 26175ffd83dbSDimitry Andric FunctionAnalysisManager &FAM) { 2618fe6060f1SDimitry Andric auto &AC = FAM.getResult<AssumptionAnalysis>(F); 26195ffd83dbSDimitry Andric TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); 26205ffd83dbSDimitry Andric DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); 2621fe6060f1SDimitry Andric AAResults &AA = FAM.getResult<AAManager>(F); 26220fca6ea1SDimitry Andric const DataLayout *DL = &F.getDataLayout(); 26230fca6ea1SDimitry Andric VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly); 26245ffd83dbSDimitry Andric if (!Combiner.run()) 26255ffd83dbSDimitry Andric return PreservedAnalyses::all(); 26265ffd83dbSDimitry Andric PreservedAnalyses PA; 26275ffd83dbSDimitry Andric PA.preserveSet<CFGAnalyses>(); 26285ffd83dbSDimitry Andric return PA; 26295ffd83dbSDimitry Andric } 2630