1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "VPlanPatternMatch.h"
17 #include "VPlanUtils.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/IVDescriptors.h"
22 #include "llvm/IR/BasicBlock.h"
23 #include "llvm/IR/IRBuilder.h"
24 #include "llvm/IR/Instruction.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/IR/Value.h"
29 #include "llvm/IR/VectorBuilder.h"
30 #include "llvm/Support/Casting.h"
31 #include "llvm/Support/CommandLine.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
37 #include <cassert>
38 
39 using namespace llvm;
40 
41 using VectorParts = SmallVector<Value *, 2>;
42 
43 namespace llvm {
44 extern cl::opt<bool> EnableVPlanNativePath;
45 }
46 extern cl::opt<unsigned> ForceTargetInstructionCost;
47 
48 #define LV_NAME "loop-vectorize"
49 #define DEBUG_TYPE LV_NAME
50 
51 bool VPRecipeBase::mayWriteToMemory() const {
52   switch (getVPDefID()) {
53   case VPInstructionSC:
54     if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()))
55       return false;
56     switch (cast<VPInstruction>(this)->getOpcode()) {
57     case Instruction::Or:
58     case Instruction::ICmp:
59     case Instruction::Select:
60     case VPInstruction::AnyOf:
61     case VPInstruction::Not:
62     case VPInstruction::CalculateTripCountMinusVF:
63     case VPInstruction::CanonicalIVIncrementForPart:
64     case VPInstruction::ExtractFromEnd:
65     case VPInstruction::FirstOrderRecurrenceSplice:
66     case VPInstruction::LogicalAnd:
67     case VPInstruction::PtrAdd:
68       return false;
69     default:
70       return true;
71     }
72   case VPInterleaveSC:
73     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
74   case VPWidenStoreEVLSC:
75   case VPWidenStoreSC:
76     return true;
77   case VPReplicateSC:
78     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
79         ->mayWriteToMemory();
80   case VPWidenCallSC:
81     return !cast<VPWidenCallRecipe>(this)
82                 ->getCalledScalarFunction()
83                 ->onlyReadsMemory();
84   case VPWidenIntrinsicSC:
85     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
86   case VPBranchOnMaskSC:
87   case VPScalarIVStepsSC:
88   case VPPredInstPHISC:
89     return false;
90   case VPBlendSC:
91   case VPReductionEVLSC:
92   case VPReductionSC:
93   case VPVectorPointerSC:
94   case VPWidenCanonicalIVSC:
95   case VPWidenCastSC:
96   case VPWidenGEPSC:
97   case VPWidenIntOrFpInductionSC:
98   case VPWidenLoadEVLSC:
99   case VPWidenLoadSC:
100   case VPWidenPHISC:
101   case VPWidenSC:
102   case VPWidenEVLSC:
103   case VPWidenSelectSC: {
104     const Instruction *I =
105         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
106     (void)I;
107     assert((!I || !I->mayWriteToMemory()) &&
108            "underlying instruction may write to memory");
109     return false;
110   }
111   default:
112     return true;
113   }
114 }
115 
116 bool VPRecipeBase::mayReadFromMemory() const {
117   switch (getVPDefID()) {
118   case VPWidenLoadEVLSC:
119   case VPWidenLoadSC:
120     return true;
121   case VPReplicateSC:
122     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
123         ->mayReadFromMemory();
124   case VPWidenCallSC:
125     return !cast<VPWidenCallRecipe>(this)
126                 ->getCalledScalarFunction()
127                 ->onlyWritesMemory();
128   case VPWidenIntrinsicSC:
129     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
130   case VPBranchOnMaskSC:
131   case VPPredInstPHISC:
132   case VPScalarIVStepsSC:
133   case VPWidenStoreEVLSC:
134   case VPWidenStoreSC:
135     return false;
136   case VPBlendSC:
137   case VPReductionEVLSC:
138   case VPReductionSC:
139   case VPVectorPointerSC:
140   case VPWidenCanonicalIVSC:
141   case VPWidenCastSC:
142   case VPWidenGEPSC:
143   case VPWidenIntOrFpInductionSC:
144   case VPWidenPHISC:
145   case VPWidenSC:
146   case VPWidenEVLSC:
147   case VPWidenSelectSC: {
148     const Instruction *I =
149         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
150     (void)I;
151     assert((!I || !I->mayReadFromMemory()) &&
152            "underlying instruction may read from memory");
153     return false;
154   }
155   default:
156     return true;
157   }
158 }
159 
160 bool VPRecipeBase::mayHaveSideEffects() const {
161   switch (getVPDefID()) {
162   case VPDerivedIVSC:
163   case VPPredInstPHISC:
164   case VPScalarCastSC:
165   case VPReverseVectorPointerSC:
166     return false;
167   case VPInstructionSC:
168     return mayWriteToMemory();
169   case VPWidenCallSC: {
170     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
171     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
172   }
173   case VPWidenIntrinsicSC:
174     return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
175   case VPBlendSC:
176   case VPReductionEVLSC:
177   case VPReductionSC:
178   case VPScalarIVStepsSC:
179   case VPVectorPointerSC:
180   case VPWidenCanonicalIVSC:
181   case VPWidenCastSC:
182   case VPWidenGEPSC:
183   case VPWidenIntOrFpInductionSC:
184   case VPWidenPHISC:
185   case VPWidenPointerInductionSC:
186   case VPWidenSC:
187   case VPWidenEVLSC:
188   case VPWidenSelectSC: {
189     const Instruction *I =
190         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
191     (void)I;
192     assert((!I || !I->mayHaveSideEffects()) &&
193            "underlying instruction has side-effects");
194     return false;
195   }
196   case VPInterleaveSC:
197     return mayWriteToMemory();
198   case VPWidenLoadEVLSC:
199   case VPWidenLoadSC:
200   case VPWidenStoreEVLSC:
201   case VPWidenStoreSC:
202     assert(
203         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
204             mayWriteToMemory() &&
205         "mayHaveSideEffects result for ingredient differs from this "
206         "implementation");
207     return mayWriteToMemory();
208   case VPReplicateSC: {
209     auto *R = cast<VPReplicateRecipe>(this);
210     return R->getUnderlyingInstr()->mayHaveSideEffects();
211   }
212   default:
213     return true;
214   }
215 }
216 
217 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
218   assert(!Parent && "Recipe already in some VPBasicBlock");
219   assert(InsertPos->getParent() &&
220          "Insertion position not in any VPBasicBlock");
221   InsertPos->getParent()->insert(this, InsertPos->getIterator());
222 }
223 
224 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
225                                 iplist<VPRecipeBase>::iterator I) {
226   assert(!Parent && "Recipe already in some VPBasicBlock");
227   assert(I == BB.end() || I->getParent() == &BB);
228   BB.insert(this, I);
229 }
230 
231 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
232   assert(!Parent && "Recipe already in some VPBasicBlock");
233   assert(InsertPos->getParent() &&
234          "Insertion position not in any VPBasicBlock");
235   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
236 }
237 
238 void VPRecipeBase::removeFromParent() {
239   assert(getParent() && "Recipe not in any VPBasicBlock");
240   getParent()->getRecipeList().remove(getIterator());
241   Parent = nullptr;
242 }
243 
244 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
245   assert(getParent() && "Recipe not in any VPBasicBlock");
246   return getParent()->getRecipeList().erase(getIterator());
247 }
248 
249 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
250   removeFromParent();
251   insertAfter(InsertPos);
252 }
253 
254 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
255                               iplist<VPRecipeBase>::iterator I) {
256   removeFromParent();
257   insertBefore(BB, I);
258 }
259 
260 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
261   // Get the underlying instruction for the recipe, if there is one. It is
262   // used to
263   //   * decide if cost computation should be skipped for this recipe,
264   //   * apply forced target instruction cost.
265   Instruction *UI = nullptr;
266   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
267     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
268   else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
269     UI = IG->getInsertPos();
270   else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
271     UI = &WidenMem->getIngredient();
272 
273   InstructionCost RecipeCost;
274   if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
275     RecipeCost = 0;
276   } else {
277     RecipeCost = computeCost(VF, Ctx);
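    // Illustrative: if the (assumed) -force-target-instruction-cost flag is
    // set, any recipe with an underlying instruction and a valid computed cost
    // has its cost overridden by the forced value below.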
278     if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
279         RecipeCost.isValid())
280       RecipeCost = InstructionCost(ForceTargetInstructionCost);
281   }
282 
283   LLVM_DEBUG({
284     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
285     dump();
286   });
287   return RecipeCost;
288 }
289 
290 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
291                                           VPCostContext &Ctx) const {
292   llvm_unreachable("subclasses should implement computeCost");
293 }
294 
295 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
296   assert(OpType == OperationType::FPMathOp &&
297          "recipe doesn't have fast math flags");
298   FastMathFlags Res;
299   Res.setAllowReassoc(FMFs.AllowReassoc);
300   Res.setNoNaNs(FMFs.NoNaNs);
301   Res.setNoInfs(FMFs.NoInfs);
302   Res.setNoSignedZeros(FMFs.NoSignedZeros);
303   Res.setAllowReciprocal(FMFs.AllowReciprocal);
304   Res.setAllowContract(FMFs.AllowContract);
305   Res.setApproxFunc(FMFs.ApproxFunc);
306   return Res;
307 }
308 
309 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
310 void VPSingleDefRecipe::dump() const { VPDef::dump(); }
311 #endif
312 
313 template <unsigned PartOpIdx>
314 VPValue *
315 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
316   if (U.getNumOperands() == PartOpIdx + 1)
317     return U.getOperand(PartOpIdx);
318   return nullptr;
319 }
320 
321 template <unsigned PartOpIdx>
322 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
323   if (auto *UnrollPartOp = getUnrollPartOperand(U))
324     return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
325   return 0;
326 }
327 
328 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
329                              VPValue *A, VPValue *B, DebugLoc DL,
330                              const Twine &Name)
331     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
332                           Pred, DL),
333       Opcode(Opcode), Name(Name.str()) {
334   assert(Opcode == Instruction::ICmp &&
335          "only ICmp predicates supported at the moment");
336 }
337 
338 VPInstruction::VPInstruction(unsigned Opcode,
339                              std::initializer_list<VPValue *> Operands,
340                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
341     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
342       Opcode(Opcode), Name(Name.str()) {
343   // Make sure the VPInstruction is a floating-point operation.
344   assert(isFPMathOp() && "this op can't take fast-math flags");
345 }
346 
347 bool VPInstruction::doesGeneratePerAllLanes() const {
348   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
349 }
350 
351 bool VPInstruction::canGenerateScalarForFirstLane() const {
352   if (Instruction::isBinaryOp(getOpcode()))
353     return true;
354   if (isSingleScalar() || isVectorToScalar())
355     return true;
356   switch (Opcode) {
357   case Instruction::ICmp:
358   case Instruction::Select:
359   case VPInstruction::BranchOnCond:
360   case VPInstruction::BranchOnCount:
361   case VPInstruction::CalculateTripCountMinusVF:
362   case VPInstruction::CanonicalIVIncrementForPart:
363   case VPInstruction::PtrAdd:
364   case VPInstruction::ExplicitVectorLength:
365   case VPInstruction::AnyOf:
366     return true;
367   default:
368     return false;
369   }
370 }
371 
372 Value *VPInstruction::generatePerLane(VPTransformState &State,
373                                       const VPLane &Lane) {
374   IRBuilderBase &Builder = State.Builder;
375 
376   assert(getOpcode() == VPInstruction::PtrAdd &&
377          "only PtrAdd opcodes are supported for now");
378   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
379                               State.get(getOperand(1), Lane), Name);
380 }
381 
382 Value *VPInstruction::generate(VPTransformState &State) {
383   IRBuilderBase &Builder = State.Builder;
384 
385   if (Instruction::isBinaryOp(getOpcode())) {
386     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
387     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
388     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
389     auto *Res =
390         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
391     if (auto *I = dyn_cast<Instruction>(Res))
392       setFlags(I);
393     return Res;
394   }
395 
396   switch (getOpcode()) {
397   case VPInstruction::Not: {
398     Value *A = State.get(getOperand(0));
399     return Builder.CreateNot(A, Name);
400   }
401   case Instruction::ICmp: {
402     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
403     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
404     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
405     return Builder.CreateCmp(getPredicate(), A, B, Name);
406   }
407   case Instruction::Select: {
408     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
409     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
410     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
411     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
412     return Builder.CreateSelect(Cond, Op1, Op2, Name);
413   }
414   case VPInstruction::ActiveLaneMask: {
415     // Get first lane of vector induction variable.
416     Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
417     // Get the original loop tripcount.
418     Value *ScalarTC = State.get(getOperand(1), VPLane(0));
419 
420     // If this part of the active lane mask is scalar, generate the CMP directly
421     // to avoid unnecessary extracts.
422     if (State.VF.isScalar())
423       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
424                                Name);
425 
426     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
427     auto *PredTy = VectorType::get(Int1Ty, State.VF);
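    // Illustrative semantics: lane i of the mask is set iff VIVElem0 + i <
    // ScalarTC; e.g. for VIVElem0 = 8, ScalarTC = 10 and VF = 4 the generated
    // llvm.get.active.lane.mask call returns <1, 1, 0, 0>.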
428     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
429                                    {PredTy, ScalarTC->getType()},
430                                    {VIVElem0, ScalarTC}, nullptr, Name);
431   }
432   case VPInstruction::FirstOrderRecurrenceSplice: {
433     // Generate code to combine the previous and current values in vector v3.
434     //
435     //   vector.ph:
436     //     v_init = vector(..., ..., ..., a[-1])
437     //     br vector.body
438     //
439     //   vector.body
440     //     i = phi [0, vector.ph], [i+4, vector.body]
441     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
442     //     v2 = a[i, i+1, i+2, i+3];
443     //     v3 = vector(v1(3), v2(0, 1, 2))
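    //
    // With VF = 4, v3 = <a[i-1], a[i], a[i+1], a[i+2]> (illustrative); this is
    // emitted below as a vector splice with offset -1.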
444 
445     auto *V1 = State.get(getOperand(0));
446     if (!V1->getType()->isVectorTy())
447       return V1;
448     Value *V2 = State.get(getOperand(1));
449     return Builder.CreateVectorSplice(V1, V2, -1, Name);
450   }
451   case VPInstruction::CalculateTripCountMinusVF: {
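    // Computes TC > Step ? TC - Step : 0 with Step = VF * UF (saturating at
    // zero). Illustrative example: TC = 10, VF = 4, UF = 2 yields 2, while
    // TC = 6 yields 0.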
452     unsigned UF = getParent()->getPlan()->getUF();
453     Value *ScalarTC = State.get(getOperand(0), VPLane(0));
454     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
455     Value *Sub = Builder.CreateSub(ScalarTC, Step);
456     Value *Cmp = Builder.CreateICmp(CmpInst::ICMP_UGT, ScalarTC, Step);
457     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
458     return Builder.CreateSelect(Cmp, Sub, Zero);
459   }
460   case VPInstruction::ExplicitVectorLength: {
461     // TODO: Restructure this code with an explicit remainder loop so that
462     // vsetvli can be outside of the main loop.
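    // Illustrative: the llvm.experimental.get.vector.length call emitted below
    // returns an element count of at most min(AVL, VF * vscale); on RISC-V
    // this typically corresponds to a vsetvli-style computation.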
463     Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
464     // Compute EVL
465     assert(AVL->getType()->isIntegerTy() &&
466            "Requested vector length should be an integer.");
467 
468     assert(State.VF.isScalable() && "Expected scalable vector factor.");
469     Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
470 
471     Value *EVL = State.Builder.CreateIntrinsic(
472         State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
473         {AVL, VFArg, State.Builder.getTrue()});
474     return EVL;
475   }
476   case VPInstruction::CanonicalIVIncrementForPart: {
477     unsigned Part = getUnrollPart(*this);
478     auto *IV = State.get(getOperand(0), VPLane(0));
479     assert(Part != 0 && "Must have a positive part");
480     // The canonical IV is incremented by the vectorization factor (num of
481     // SIMD elements) times the unroll part.
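    // Illustrative example: for a fixed VF = 4 and Part = 2 the step below is
    // the constant 8; for a scalable VF it is vscale * 4 * 2.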
482     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
483     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
484                              hasNoSignedWrap());
485   }
486   case VPInstruction::BranchOnCond: {
487     Value *Cond = State.get(getOperand(0), VPLane(0));
488     // Replace the temporary unreachable terminator with a new conditional
489     // branch, hooking it up to backward destination for exiting blocks now and
490     // to forward destination(s) later when they are created.
491     BranchInst *CondBr =
492         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
493     CondBr->setSuccessor(0, nullptr);
494     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
495 
496     if (!getParent()->isExiting())
497       return CondBr;
498 
499     VPRegionBlock *ParentRegion = getParent()->getParent();
500     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
501     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
502     return CondBr;
503   }
504   case VPInstruction::BranchOnCount: {
505     // First create the compare.
506     Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
507     Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
508     Value *Cond = Builder.CreateICmpEQ(IV, TC);
509 
510     // Now create the branch.
511     auto *Plan = getParent()->getPlan();
512     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
513     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
514 
515     // Replace the temporary unreachable terminator with a new conditional
516     // branch, hooking it up to backward destination (the header) now and to the
517     // forward destination (the exit/middle block) later when it is created.
518     // Note that CreateCondBr expects a valid BB as first argument, so we need
519     // to set it to nullptr later.
520     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
521                                               State.CFG.VPBB2IRBB[Header]);
522     CondBr->setSuccessor(0, nullptr);
523     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
524     return CondBr;
525   }
526   case VPInstruction::ComputeReductionResult: {
527     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
528     // and will be removed by breaking up the recipe further.
529     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
530     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
531     // Get its reduction variable descriptor.
532     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
533 
534     RecurKind RK = RdxDesc.getRecurrenceKind();
535 
536     Type *PhiTy = OrigPhi->getType();
537     // The recipe's operands are the reduction phi, followed by one operand for
538     // each part of the reduction.
539     unsigned UF = getNumOperands() - 1;
540     VectorParts RdxParts(UF);
541     for (unsigned Part = 0; Part < UF; ++Part)
542       RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
543 
544     // If the vector reduction can be performed in a smaller type, we truncate
545     // then extend the loop exit value to enable InstCombine to evaluate the
546     // entire expression in the smaller type.
547     // TODO: Handle this in truncateToMinBW.
548     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
549       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
550       for (unsigned Part = 0; Part < UF; ++Part)
551         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
552     }
553     // Reduce all of the unrolled parts into a single vector.
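    // Illustrative example: for UF = 2 and an integer add reduction, the loop
    // below emits bin.rdx = add RdxParts[1], RdxParts[0]; for not-in-loop
    // reductions the final scalar is then produced by a vector.reduce.add.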
554     Value *ReducedPartRdx = RdxParts[0];
555     unsigned Op = RdxDesc.getOpcode();
556     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
557       Op = Instruction::Or;
558 
559     if (PhiR->isOrdered()) {
560       ReducedPartRdx = RdxParts[UF - 1];
561     } else {
562       // Floating-point operations should have some FMF to enable the reduction.
563       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
564       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
565       for (unsigned Part = 1; Part < UF; ++Part) {
566         Value *RdxPart = RdxParts[Part];
567         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
568           ReducedPartRdx = Builder.CreateBinOp(
569               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
570         else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
571           ReducedPartRdx =
572               createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
573         else
574           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
575       }
576     }
577 
578     // Create the reduction after the loop. Note that inloop reductions create
579     // the target reduction in the loop using a Reduction recipe.
580     if ((State.VF.isVector() ||
581          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
582          RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
583         !PhiR->isInLoop()) {
584       ReducedPartRdx =
585           createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
586       // If the reduction can be performed in a smaller type, we need to extend
587       // the reduction to the wider type before we branch to the original loop.
588       if (PhiTy != RdxDesc.getRecurrenceType())
589         ReducedPartRdx = RdxDesc.isSigned()
590                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
591                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
592     }
593 
594     return ReducedPartRdx;
595   }
596   case VPInstruction::ExtractFromEnd: {
597     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
598     unsigned Offset = CI->getZExtValue();
599     assert(Offset > 0 && "Offset from end must be positive");
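    // Illustrative: Offset counts from the end of the vector, so Offset == 1
    // selects the last lane (lane VF - 1) and Offset == 2 the lane before it.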
600     Value *Res;
601     if (State.VF.isVector()) {
602       assert(Offset <= State.VF.getKnownMinValue() &&
603              "invalid offset to extract from");
604       // Extract lane VF - Offset from the operand.
605       Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
606     } else {
607       assert(Offset <= 1 && "invalid offset to extract from");
608       Res = State.get(getOperand(0));
609     }
610     if (isa<ExtractElementInst>(Res))
611       Res->setName(Name);
612     return Res;
613   }
614   case VPInstruction::LogicalAnd: {
615     Value *A = State.get(getOperand(0));
616     Value *B = State.get(getOperand(1));
617     return Builder.CreateLogicalAnd(A, B, Name);
618   }
619   case VPInstruction::PtrAdd: {
620     assert(vputils::onlyFirstLaneUsed(this) &&
621            "can only generate first lane for PtrAdd");
622     Value *Ptr = State.get(getOperand(0), VPLane(0));
623     Value *Addend = State.get(getOperand(1), VPLane(0));
624     return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
625   }
626   case VPInstruction::ResumePhi: {
627     Value *IncomingFromVPlanPred =
628         State.get(getOperand(0), /* IsScalar */ true);
629     Value *IncomingFromOtherPreds =
630         State.get(getOperand(1), /* IsScalar */ true);
631     auto *NewPhi =
632         Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
633     BasicBlock *VPlanPred =
634         State.CFG
635             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
636     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
637     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
638       if (OtherPred == VPlanPred)
639         continue;
640       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
641     }
642     return NewPhi;
643   }
644   case VPInstruction::AnyOf: {
645     Value *A = State.get(getOperand(0));
646     return Builder.CreateOrReduce(A);
647   }
648 
649   default:
650     llvm_unreachable("Unsupported opcode for instruction");
651   }
652 }
653 
654 bool VPInstruction::isVectorToScalar() const {
655   return getOpcode() == VPInstruction::ExtractFromEnd ||
656          getOpcode() == VPInstruction::ComputeReductionResult ||
657          getOpcode() == VPInstruction::AnyOf;
658 }
659 
660 bool VPInstruction::isSingleScalar() const {
661   return getOpcode() == VPInstruction::ResumePhi;
662 }
663 
664 #if !defined(NDEBUG)
665 bool VPInstruction::isFPMathOp() const {
666   // Inspired by FPMathOperator::classof. Notable differences are that we don't
667   // support Call, PHI and Select opcodes here yet.
668   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
669          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
670          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
671          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
672 }
673 #endif
674 
675 void VPInstruction::execute(VPTransformState &State) {
676   assert(!State.Lane && "VPInstruction executing a Lane");
677   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
678   assert((hasFastMathFlags() == isFPMathOp() ||
679           getOpcode() == Instruction::Select) &&
680          "Recipe not an FPMathOp but has fast-math flags?");
681   if (hasFastMathFlags())
682     State.Builder.setFastMathFlags(getFastMathFlags());
683   State.setDebugLocFrom(getDebugLoc());
684   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
685                                    (vputils::onlyFirstLaneUsed(this) ||
686                                     isVectorToScalar() || isSingleScalar());
687   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
688   if (GeneratesPerAllLanes) {
689     for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
690          Lane != NumLanes; ++Lane) {
691       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
692       assert(GeneratedValue && "generatePerLane must produce a value");
693       State.set(this, GeneratedValue, VPLane(Lane));
694     }
695     return;
696   }
697 
698   Value *GeneratedValue = generate(State);
699   if (!hasResult())
700     return;
701   assert(GeneratedValue && "generate must produce a value");
702   assert(
703       (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
704        State.VF.isScalar()) &&
705       "scalar value but not only first lane defined");
706   State.set(this, GeneratedValue,
707             /*IsScalar*/ GeneratesPerFirstLaneOnly);
708 }
709 
710 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
711   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
712   if (Instruction::isBinaryOp(getOpcode()))
713     return vputils::onlyFirstLaneUsed(this);
714 
715   switch (getOpcode()) {
716   default:
717     return false;
718   case Instruction::ICmp:
719   case Instruction::Select:
720   case Instruction::Or:
721   case VPInstruction::PtrAdd:
722     // TODO: Cover additional opcodes.
723     return vputils::onlyFirstLaneUsed(this);
724   case VPInstruction::ActiveLaneMask:
725   case VPInstruction::ExplicitVectorLength:
726   case VPInstruction::CalculateTripCountMinusVF:
727   case VPInstruction::CanonicalIVIncrementForPart:
728   case VPInstruction::BranchOnCount:
729   case VPInstruction::BranchOnCond:
730   case VPInstruction::ResumePhi:
731     return true;
732   };
733   llvm_unreachable("switch should return");
734 }
735 
736 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
737   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
738   if (Instruction::isBinaryOp(getOpcode()))
739     return vputils::onlyFirstPartUsed(this);
740 
741   switch (getOpcode()) {
742   default:
743     return false;
744   case Instruction::ICmp:
745   case Instruction::Select:
746     return vputils::onlyFirstPartUsed(this);
747   case VPInstruction::BranchOnCount:
748   case VPInstruction::BranchOnCond:
749   case VPInstruction::CanonicalIVIncrementForPart:
750     return true;
751   };
752   llvm_unreachable("switch should return");
753 }
754 
755 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
756 void VPInstruction::dump() const {
757   VPSlotTracker SlotTracker(getParent()->getPlan());
758   print(dbgs(), "", SlotTracker);
759 }
760 
761 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
762                           VPSlotTracker &SlotTracker) const {
763   O << Indent << "EMIT ";
764 
765   if (hasResult()) {
766     printAsOperand(O, SlotTracker);
767     O << " = ";
768   }
769 
770   switch (getOpcode()) {
771   case VPInstruction::Not:
772     O << "not";
773     break;
774   case VPInstruction::SLPLoad:
775     O << "combined load";
776     break;
777   case VPInstruction::SLPStore:
778     O << "combined store";
779     break;
780   case VPInstruction::ActiveLaneMask:
781     O << "active lane mask";
782     break;
783   case VPInstruction::ResumePhi:
784     O << "resume-phi";
785     break;
786   case VPInstruction::ExplicitVectorLength:
787     O << "EXPLICIT-VECTOR-LENGTH";
788     break;
789   case VPInstruction::FirstOrderRecurrenceSplice:
790     O << "first-order splice";
791     break;
792   case VPInstruction::BranchOnCond:
793     O << "branch-on-cond";
794     break;
795   case VPInstruction::CalculateTripCountMinusVF:
796     O << "TC > VF ? TC - VF : 0";
797     break;
798   case VPInstruction::CanonicalIVIncrementForPart:
799     O << "VF * Part +";
800     break;
801   case VPInstruction::BranchOnCount:
802     O << "branch-on-count";
803     break;
804   case VPInstruction::ExtractFromEnd:
805     O << "extract-from-end";
806     break;
807   case VPInstruction::ComputeReductionResult:
808     O << "compute-reduction-result";
809     break;
810   case VPInstruction::LogicalAnd:
811     O << "logical-and";
812     break;
813   case VPInstruction::PtrAdd:
814     O << "ptradd";
815     break;
816   case VPInstruction::AnyOf:
817     O << "any-of";
818     break;
819   default:
820     O << Instruction::getOpcodeName(getOpcode());
821   }
822 
823   printFlags(O);
824   printOperands(O, SlotTracker);
825 
826   if (auto DL = getDebugLoc()) {
827     O << ", !dbg ";
828     DL.print(O);
829   }
830 }
831 #endif
832 
833 void VPIRInstruction::execute(VPTransformState &State) {
834   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
835          "Only PHINodes can have extra operands");
836   for (const auto &[Idx, Op] : enumerate(operands())) {
837     VPValue *ExitValue = Op;
838     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
839                     ? VPLane::getFirstLane()
840                     : VPLane::getLastLaneForVF(State.VF);
841     VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
842     auto *PredVPBB = Pred->getExitingBasicBlock();
843     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
844     // Set insertion point in PredBB in case an extract needs to be generated.
845     // TODO: Model extracts explicitly.
846     State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
847     Value *V = State.get(ExitValue, VPLane(Lane));
848     auto *Phi = cast<PHINode>(&I);
849     // If there is no existing block for PredBB in the phi, add a new incoming
850     // value. Otherwise update the existing incoming value for PredBB.
851     if (Phi->getBasicBlockIndex(PredBB) == -1)
852       Phi->addIncoming(V, PredBB);
853     else
854       Phi->setIncomingValueForBlock(PredBB, V);
855   }
856 
857   // Advance the insert point after the wrapped IR instruction. This allows
858   // interleaving VPIRInstructions and other recipes.
859   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
860 }
861 
862 InstructionCost VPIRInstruction::computeCost(ElementCount VF,
863                                              VPCostContext &Ctx) const {
864   // The recipe wraps an existing IR instruction on the border of VPlan's scope,
865   // hence it does not contribute to the cost-modeling for the VPlan.
866   return 0;
867 }
868 
869 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
870 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
871                             VPSlotTracker &SlotTracker) const {
872   O << Indent << "IR " << I;
873 
874   if (getNumOperands() != 0) {
875     O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
876     interleaveComma(
877         enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
878           Op.value()->printAsOperand(O, SlotTracker);
879           O << " from ";
880           getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
881         });
882     O << ")";
883   }
884 }
885 #endif
886 
887 void VPWidenCallRecipe::execute(VPTransformState &State) {
888   assert(State.VF.isVector() && "not widening");
889   State.setDebugLocFrom(getDebugLoc());
890 
891   FunctionType *VFTy = Variant->getFunctionType();
892   // Collect the arguments for the call to the vector function variant.
893   SmallVector<Value *, 4> Args;
894   for (const auto &I : enumerate(arg_operands())) {
895     Value *Arg;
896     // Some vectorized function variants may also take a scalar argument,
897     // e.g. linear parameters for pointers. This needs to be the scalar value
898     // from the start of the respective part when interleaving.
899     if (!VFTy->getParamType(I.index())->isVectorTy())
900       Arg = State.get(I.value(), VPLane(0));
901     else
902       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
903     Args.push_back(Arg);
904   }
905 
906   assert(Variant != nullptr && "Can't create vector function.");
907 
908   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
909   SmallVector<OperandBundleDef, 1> OpBundles;
910   if (CI)
911     CI->getOperandBundlesAsDefs(OpBundles);
912 
913   CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
914   setFlags(V);
915 
916   if (!V->getType()->isVoidTy())
917     State.set(this, V);
918   State.addMetadata(V, CI);
919 }
920 
921 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
922                                                VPCostContext &Ctx) const {
923   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
924   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
925                                   Variant->getFunctionType()->params(),
926                                   CostKind);
927 }
928 
929 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
930 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
931                               VPSlotTracker &SlotTracker) const {
932   O << Indent << "WIDEN-CALL ";
933 
934   Function *CalledFn = getCalledScalarFunction();
935   if (CalledFn->getReturnType()->isVoidTy())
936     O << "void ";
937   else {
938     printAsOperand(O, SlotTracker);
939     O << " = ";
940   }
941 
942   O << "call";
943   printFlags(O);
944   O << " @" << CalledFn->getName() << "(";
945   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
946     Op->printAsOperand(O, SlotTracker);
947   });
948   O << ")";
949 
950   O << " (using library function";
951   if (Variant->hasName())
952     O << ": " << Variant->getName();
953   O << ")";
954 }
955 #endif
956 
957 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
958   assert(State.VF.isVector() && "not widening");
959   State.setDebugLocFrom(getDebugLoc());
960 
961   SmallVector<Type *, 2> TysForDecl;
962   // Add return type if intrinsic is overloaded on it.
963   if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
964     TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
965   SmallVector<Value *, 4> Args;
966   for (const auto &I : enumerate(operands())) {
967     // Some intrinsics have a scalar argument - don't replace it with a
968     // vector.
969     Value *Arg;
970     if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
971                                            State.TTI))
972       Arg = State.get(I.value(), VPLane(0));
973     else
974       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
975     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
976                                                State.TTI))
977       TysForDecl.push_back(Arg->getType());
978     Args.push_back(Arg);
979   }
980 
981   // Use vector version of the intrinsic.
982   Module *M = State.Builder.GetInsertBlock()->getModule();
983   Function *VectorF =
984       Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
985   assert(VectorF &&
986          "Can't retrieve vector intrinsic or vector-predication intrinsics.");
987 
988   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
989   SmallVector<OperandBundleDef, 1> OpBundles;
990   if (CI)
991     CI->getOperandBundlesAsDefs(OpBundles);
992 
993   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
994 
995   setFlags(V);
996 
997   if (!V->getType()->isVoidTy())
998     State.set(this, V);
999   State.addMetadata(V, CI);
1000 }
1001 
1002 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1003                                                     VPCostContext &Ctx) const {
1004   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1005 
1006   // Some backends analyze intrinsic arguments to determine cost. Use the
1007   // underlying value for the operand if it has one. Otherwise try to use the
1008   // operand of the underlying call instruction, if there is one. Otherwise
1009   // clear Arguments.
1010   // TODO: Rework TTI interface to be independent of concrete IR values.
1011   SmallVector<const Value *> Arguments;
1012   for (const auto &[Idx, Op] : enumerate(operands())) {
1013     auto *V = Op->getUnderlyingValue();
1014     if (!V) {
1015       // Push all the VP intrinsic's operands into Arguments even if the
1016       // value is nullptr. Some VP intrinsic cost hooks assert on the number
1017       // of parameters. This mainly happens in two scenarios:
1018       // 1. The EVL operand is nullptr.
1019       // 2. An argument of the VP intrinsic is itself a VP intrinsic.
1020       if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
1021         Arguments.push_back(V);
1022         continue;
1023       }
1024       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
1025         Arguments.push_back(UI->getArgOperand(Idx));
1026         continue;
1027       }
1028       Arguments.clear();
1029       break;
1030     }
1031     Arguments.push_back(V);
1032   }
1033 
1034   Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
1035   SmallVector<Type *> ParamTys;
1036   for (unsigned I = 0; I != getNumOperands(); ++I)
1037     ParamTys.push_back(
1038         ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
1039 
1040   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1041   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1042   IntrinsicCostAttributes CostAttrs(
1043       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1044       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
1045   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
1046 }
1047 
1048 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1049   return Intrinsic::getBaseName(VectorIntrinsicID);
1050 }
1051 
1052 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1053   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1054   // Vector predication intrinsics only demand the first lane of the last
1055   // operand (the EVL operand).
1056   return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
1057          Op == getOperand(getNumOperands() - 1);
1058 }
1059 
1060 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1061 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
1062                                    VPSlotTracker &SlotTracker) const {
1063   O << Indent << "WIDEN-INTRINSIC ";
1064   if (ResultTy->isVoidTy()) {
1065     O << "void ";
1066   } else {
1067     printAsOperand(O, SlotTracker);
1068     O << " = ";
1069   }
1070 
1071   O << "call";
1072   printFlags(O);
1073   O << getIntrinsicName() << "(";
1074 
1075   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1076     Op->printAsOperand(O, SlotTracker);
1077   });
1078   O << ")";
1079 }
1080 #endif
1081 
1082 void VPHistogramRecipe::execute(VPTransformState &State) {
1083   State.setDebugLocFrom(getDebugLoc());
1084   IRBuilderBase &Builder = State.Builder;
1085 
1086   Value *Address = State.get(getOperand(0));
1087   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1088   VectorType *VTy = cast<VectorType>(Address->getType());
1089 
1090   // The histogram intrinsic requires a mask even if the recipe doesn't;
1091   // if the mask operand was omitted then all lanes should be executed and
1092   // we just need to synthesize an all-true mask.
1093   Value *Mask = nullptr;
1094   if (VPValue *VPMask = getMask())
1095     Mask = State.get(VPMask);
1096   else
1097     Mask =
1098         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1099 
1100   // If this is a subtract, we want to invert the increment amount. We may
1101   // add a separate intrinsic in future, but for now we'll try this.
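  // Illustrative: a Sub update with IncAmt == 1 is emitted as a histogram add
  // with an increment of -1.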
1102   if (Opcode == Instruction::Sub)
1103     IncAmt = Builder.CreateNeg(IncAmt);
1104   else
1105     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1106 
1107   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1108                                 {VTy, IncAmt->getType()},
1109                                 {Address, IncAmt, Mask});
1110 }
1111 
1112 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1113                                                VPCostContext &Ctx) const {
1114   // FIXME: Take the gather and scatter into account as well. For now we're
1115   //        generating the same cost as the fallback path, but we'll likely
1116   //        need to create a new TTI method for determining the cost, including
1117   //        whether we can use base + vec-of-smaller-indices or just
1118   //        vec-of-pointers.
1119   assert(VF.isVector() && "Invalid VF for histogram cost");
1120   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1121   VPValue *IncAmt = getOperand(1);
1122   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1123   VectorType *VTy = VectorType::get(IncTy, VF);
1124 
1125   // Assume that a non-constant update value (or a constant != 1) requires
1126   // a multiply, and add that into the cost.
1127   InstructionCost MulCost =
1128       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
1129   if (IncAmt->isLiveIn()) {
1130     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1131 
1132     if (CI && CI->getZExtValue() == 1)
1133       MulCost = TTI::TCC_Free;
1134   }
1135 
1136   // Find the cost of the histogram operation itself.
1137   Type *PtrTy = VectorType::get(AddressTy, VF);
1138   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1139   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1140                               Type::getVoidTy(Ctx.LLVMCtx),
1141                               {PtrTy, IncTy, MaskTy});
1142 
1143   // Add the costs together with the add/sub operation.
1144   return Ctx.TTI.getIntrinsicInstrCost(
1145              ICA, TargetTransformInfo::TCK_RecipThroughput) +
1146          MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
1147 }
1148 
1149 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1150 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1151                               VPSlotTracker &SlotTracker) const {
1152   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1153   getOperand(0)->printAsOperand(O, SlotTracker);
1154 
1155   if (Opcode == Instruction::Sub)
1156     O << ", dec: ";
1157   else {
1158     assert(Opcode == Instruction::Add);
1159     O << ", inc: ";
1160   }
1161   getOperand(1)->printAsOperand(O, SlotTracker);
1162 
1163   if (VPValue *Mask = getMask()) {
1164     O << ", mask: ";
1165     Mask->printAsOperand(O, SlotTracker);
1166   }
1167 }
1168 
1169 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1170                                 VPSlotTracker &SlotTracker) const {
1171   O << Indent << "WIDEN-SELECT ";
1172   printAsOperand(O, SlotTracker);
1173   O << " = select ";
1174   getOperand(0)->printAsOperand(O, SlotTracker);
1175   O << ", ";
1176   getOperand(1)->printAsOperand(O, SlotTracker);
1177   O << ", ";
1178   getOperand(2)->printAsOperand(O, SlotTracker);
1179   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1180 }
1181 #endif
1182 
1183 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1184   State.setDebugLocFrom(getDebugLoc());
1185 
1186   // The condition can be loop invariant but still defined inside the
1187   // loop. This means that we can't just use the original 'cond' value.
1188   // We have to take the 'vectorized' value and pick the first lane.
1189   // Instcombine will make this a no-op.
1190   auto *InvarCond =
1191       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1192 
1193   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1194   Value *Op0 = State.get(getOperand(1));
1195   Value *Op1 = State.get(getOperand(2));
1196   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1197   State.set(this, Sel);
1198   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1199 }
1200 
1201 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1202                                                  VPCostContext &Ctx) const {
1203   SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
1204   bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1205   Type *ScalarTy = Ctx.Types.inferScalarType(this);
1206   Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
1207   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1208 
1209   VPValue *Op0, *Op1;
1210   using namespace llvm::VPlanPatternMatch;
1211   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1212       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1213        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1214     // select x, y, false --> x & y
1215     // select x, true, y --> x | y
1216     const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1217     const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1218 
1219     SmallVector<const Value *, 2> Operands;
1220     if (all_of(operands(),
1221                [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1222       Operands.append(SI->op_begin(), SI->op_end());
1223     bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1224     return Ctx.TTI.getArithmeticInstrCost(
1225         IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, CostKind,
1226         {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1227   }
1228 
1229   Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1230   if (!ScalarCond)
1231     CondTy = VectorType::get(CondTy, VF);
1232 
1233   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1234   if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
1235     Pred = Cmp->getPredicate();
1236   return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred,
1237                                     CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1238                                     {TTI::OK_AnyValue, TTI::OP_None}, SI);
1239 }
1240 
1241 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
1242     const FastMathFlags &FMF) {
1243   AllowReassoc = FMF.allowReassoc();
1244   NoNaNs = FMF.noNaNs();
1245   NoInfs = FMF.noInfs();
1246   NoSignedZeros = FMF.noSignedZeros();
1247   AllowReciprocal = FMF.allowReciprocal();
1248   AllowContract = FMF.allowContract();
1249   ApproxFunc = FMF.approxFunc();
1250 }
1251 
1252 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1253 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1254   switch (OpType) {
1255   case OperationType::Cmp:
1256     O << " " << CmpInst::getPredicateName(getPredicate());
1257     break;
1258   case OperationType::DisjointOp:
1259     if (DisjointFlags.IsDisjoint)
1260       O << " disjoint";
1261     break;
1262   case OperationType::PossiblyExactOp:
1263     if (ExactFlags.IsExact)
1264       O << " exact";
1265     break;
1266   case OperationType::OverflowingBinOp:
1267     if (WrapFlags.HasNUW)
1268       O << " nuw";
1269     if (WrapFlags.HasNSW)
1270       O << " nsw";
1271     break;
1272   case OperationType::FPMathOp:
1273     getFastMathFlags().print(O);
1274     break;
1275   case OperationType::GEPOp:
1276     if (GEPFlags.isInBounds())
1277       O << " inbounds";
1278     else if (GEPFlags.hasNoUnsignedSignedWrap())
1279       O << " nusw";
1280     if (GEPFlags.hasNoUnsignedWrap())
1281       O << " nuw";
1282     break;
1283   case OperationType::NonNegOp:
1284     if (NonNegFlags.NonNeg)
1285       O << " nneg";
1286     break;
1287   case OperationType::Other:
1288     break;
1289   }
1290   if (getNumOperands() > 0)
1291     O << " ";
1292 }
1293 #endif
1294 
1295 void VPWidenRecipe::execute(VPTransformState &State) {
1296   State.setDebugLocFrom(getDebugLoc());
1297   auto &Builder = State.Builder;
1298   switch (Opcode) {
1299   case Instruction::Call:
1300   case Instruction::Br:
1301   case Instruction::PHI:
1302   case Instruction::GetElementPtr:
1303   case Instruction::Select:
1304     llvm_unreachable("This instruction is handled by a different recipe.");
1305   case Instruction::UDiv:
1306   case Instruction::SDiv:
1307   case Instruction::SRem:
1308   case Instruction::URem:
1309   case Instruction::Add:
1310   case Instruction::FAdd:
1311   case Instruction::Sub:
1312   case Instruction::FSub:
1313   case Instruction::FNeg:
1314   case Instruction::Mul:
1315   case Instruction::FMul:
1316   case Instruction::FDiv:
1317   case Instruction::FRem:
1318   case Instruction::Shl:
1319   case Instruction::LShr:
1320   case Instruction::AShr:
1321   case Instruction::And:
1322   case Instruction::Or:
1323   case Instruction::Xor: {
1324     // Just widen unops and binops.
1325     SmallVector<Value *, 2> Ops;
1326     for (VPValue *VPOp : operands())
1327       Ops.push_back(State.get(VPOp));
1328 
1329     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1330 
1331     if (auto *VecOp = dyn_cast<Instruction>(V))
1332       setFlags(VecOp);
1333 
1334     // Use this vector value for all users of the original instruction.
1335     State.set(this, V);
1336     State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1337     break;
1338   }
1339   case Instruction::Freeze: {
1340     Value *Op = State.get(getOperand(0));
1341 
1342     Value *Freeze = Builder.CreateFreeze(Op);
1343     State.set(this, Freeze);
1344     break;
1345   }
1346   case Instruction::ICmp:
1347   case Instruction::FCmp: {
1348     // Widen compares. Generate vector compares.
1349     bool FCmp = Opcode == Instruction::FCmp;
1350     Value *A = State.get(getOperand(0));
1351     Value *B = State.get(getOperand(1));
1352     Value *C = nullptr;
1353     if (FCmp) {
1354       // Propagate fast math flags.
1355       IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1356       if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
1357         Builder.setFastMathFlags(I->getFastMathFlags());
1358       C = Builder.CreateFCmp(getPredicate(), A, B);
1359     } else {
1360       C = Builder.CreateICmp(getPredicate(), A, B);
1361     }
1362     State.set(this, C);
1363     State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1364     break;
1365   }
1366   default:
1367     // This instruction is not vectorized by simple widening.
1368     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1369                       << Instruction::getOpcodeName(Opcode));
1370     llvm_unreachable("Unhandled instruction!");
1371   } // end of switch.
1372 
1373 #if !defined(NDEBUG)
1374   // Verify that VPlan type inference results agree with the type of the
1375   // generated values.
1376   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1377              State.get(this)->getType() &&
1378          "inferred type and type from generated instructions do not match");
1379 #endif
1380 }
1381 
1382 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1383                                            VPCostContext &Ctx) const {
1384   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1385   switch (Opcode) {
1386   case Instruction::FNeg: {
1387     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
1388     return Ctx.TTI.getArithmeticInstrCost(
1389         Opcode, VectorTy, CostKind,
1390         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1391         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1392   }
1393 
1394   case Instruction::UDiv:
1395   case Instruction::SDiv:
1396   case Instruction::SRem:
1397   case Instruction::URem:
1398     // More complex computation, let the legacy cost-model handle this for now.
1399     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1400   case Instruction::Add:
1401   case Instruction::FAdd:
1402   case Instruction::Sub:
1403   case Instruction::FSub:
1404   case Instruction::Mul:
1405   case Instruction::FMul:
1406   case Instruction::FDiv:
1407   case Instruction::FRem:
1408   case Instruction::Shl:
1409   case Instruction::LShr:
1410   case Instruction::AShr:
1411   case Instruction::And:
1412   case Instruction::Or:
1413   case Instruction::Xor: {
1414     VPValue *RHS = getOperand(1);
1415     // Certain instructions can be cheaper to vectorize if they have a constant
1416     // second vector operand. One example of this are shifts on x86.
1417     TargetTransformInfo::OperandValueInfo RHSInfo = {
1418         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1419     if (RHS->isLiveIn())
1420       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1421 
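    // An operand defined outside any loop region is invariant in the vectorized
    // loop, so treat it as a uniform value when nothing better is known.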
1422     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1423         getOperand(1)->isDefinedOutsideLoopRegions())
1424       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1425     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
1426     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1427 
1428     SmallVector<const Value *, 4> Operands;
1429     if (CtxI)
1430       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1431     return Ctx.TTI.getArithmeticInstrCost(
1432         Opcode, VectorTy, CostKind,
1433         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1434         RHSInfo, Operands, CtxI, &Ctx.TLI);
1435   }
1436   case Instruction::Freeze: {
1437     // Freeze cost is not modeled here; conservatively treat it like a 'mul'.
1438     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
1439     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
1440   }
1441   case Instruction::ICmp:
1442   case Instruction::FCmp: {
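    // Cost the compare on the widened type of its operands; the i1 result type
    // is implied by the compare opcode.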
1443     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1444     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1445     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1446                                       CostKind,
1447                                       {TTI::OK_AnyValue, TTI::OP_None},
1448                                       {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1449   }
1450   default:
1451     llvm_unreachable("Unsupported opcode for instruction");
1452   }
1453 }
1454 
1455 void VPWidenEVLRecipe::execute(VPTransformState &State) {
1456   unsigned Opcode = getOpcode();
1457   // TODO: Support other opcodes
1458   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1459     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
1460 
1461   State.setDebugLocFrom(getDebugLoc());
1462 
1463   assert(State.get(getOperand(0))->getType()->isVectorTy() &&
1464          "VPWidenEVLRecipe should not be used for scalars");
1465 
1466   VPValue *EVL = getEVL();
1467   Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
1468   IRBuilderBase &BuilderIR = State.Builder;
1469   VectorBuilder Builder(BuilderIR);
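  // The recipe carries no mask operand; use an all-true mask and let the EVL
  // bound the number of active lanes.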
1470   Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
1471 
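  // Collect the widened operands, excluding the trailing EVL operand, which is
  // passed to the VectorBuilder separately via setEVL below.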
1472   SmallVector<Value *, 4> Ops;
1473   for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
1474     VPValue *VPOp = getOperand(I);
1475     Ops.push_back(State.get(VPOp));
1476   }
1477 
1478   Builder.setMask(Mask).setEVL(EVLArg);
1479   Value *VPInst =
1480       Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
1481   // Currently VP intrinsics only accept fast-math flags.
1482   // TODO: Enable other flags when support is added.
1483   if (isa<FPMathOperator>(VPInst))
1484     setFlags(cast<Instruction>(VPInst));
1485 
1486   State.set(this, VPInst);
1487   State.addMetadata(VPInst,
1488                     dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1489 }
1490 
1491 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1492 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1493                           VPSlotTracker &SlotTracker) const {
1494   O << Indent << "WIDEN ";
1495   printAsOperand(O, SlotTracker);
1496   O << " = " << Instruction::getOpcodeName(Opcode);
1497   printFlags(O);
1498   printOperands(O, SlotTracker);
1499 }
1500 
1501 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1502                              VPSlotTracker &SlotTracker) const {
1503   O << Indent << "WIDEN ";
1504   printAsOperand(O, SlotTracker);
1505   O << " = vp." << Instruction::getOpcodeName(getOpcode());
1506   printFlags(O);
1507   printOperands(O, SlotTracker);
1508 }
1509 #endif
1510 
1511 void VPWidenCastRecipe::execute(VPTransformState &State) {
1512   State.setDebugLocFrom(getDebugLoc());
1513   auto &Builder = State.Builder;
1514   // Vectorize casts.
1515   assert(State.VF.isVector() && "Not vectorizing?");
1516   Type *DestTy = VectorType::get(getResultType(), State.VF);
1517   VPValue *Op = getOperand(0);
1518   Value *A = State.get(Op);
1519   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1520   State.set(this, Cast);
1521   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1522   if (auto *CastOp = dyn_cast<Instruction>(Cast))
1523     setFlags(CastOp);
1524 }
1525 
1526 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
1527                                                VPCostContext &Ctx) const {
1528   // TODO: In some cases, VPWidenCastRecipes are created but not considered in
1529   // the legacy cost model, including truncates/extends when evaluating a
1530   // reduction in a smaller type.
1531   if (!getUnderlyingValue())
1532     return 0;
1533   // Computes the CastContextHint for a recipe that may access memory.
1534   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1535     if (VF.isScalar())
1536       return TTI::CastContextHint::Normal;
1537     if (isa<VPInterleaveRecipe>(R))
1538       return TTI::CastContextHint::Interleave;
1539     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
1540       return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1541                                              : TTI::CastContextHint::Normal;
1542     const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1543     if (WidenMemoryRecipe == nullptr)
1544       return TTI::CastContextHint::None;
1545     if (!WidenMemoryRecipe->isConsecutive())
1546       return TTI::CastContextHint::GatherScatter;
1547     if (WidenMemoryRecipe->isReverse())
1548       return TTI::CastContextHint::Reversed;
1549     if (WidenMemoryRecipe->isMasked())
1550       return TTI::CastContextHint::Masked;
1551     return TTI::CastContextHint::Normal;
1552   };
1553 
1554   VPValue *Operand = getOperand(0);
1555   TTI::CastContextHint CCH = TTI::CastContextHint::None;
1556   // For Trunc/FPTrunc, get the context from the only user.
1557   if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
1558       !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
1559     if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
1560       CCH = ComputeCCH(StoreRecipe);
1561   }
1562   // For ZExt/SExt/FPExt, get the context from the operand.
1563   else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1564            Opcode == Instruction::FPExt) {
1565     if (Operand->isLiveIn())
1566       CCH = TTI::CastContextHint::Normal;
1567     else if (Operand->getDefiningRecipe())
1568       CCH = ComputeCCH(Operand->getDefiningRecipe());
1569   }
1570 
1571   auto *SrcTy =
1572       cast<VectorType>(ToVectorTy(Ctx.Types.inferScalarType(Operand), VF));
1573   auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF));
1574   // Arm TTI will use the underlying instruction to determine the cost.
1575   return Ctx.TTI.getCastInstrCost(
1576       Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
1577       dyn_cast_if_present<Instruction>(getUnderlyingValue()));
1578 }
1579 
1580 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1581 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1582                               VPSlotTracker &SlotTracker) const {
1583   O << Indent << "WIDEN-CAST ";
1584   printAsOperand(O, SlotTracker);
1585   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1586   printFlags(O);
1587   printOperands(O, SlotTracker);
1588   O << " to " << *getResultType();
1589 }
1590 #endif
1591 
1592 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
1593                                                VPCostContext &Ctx) const {
1594   return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
1595 }
1596 
1597 /// This function adds the step vector
1598 /// (0 * Step, 1 * Step, 2 * Step, ...)
1599 /// elementwise to Val.
1600 /// \p BinOp is only relevant for FP induction variables.
1601 static Value *getStepVector(Value *Val, Value *Step,
1602                             Instruction::BinaryOps BinOp, ElementCount VF,
1603                             IRBuilderBase &Builder) {
1604   assert(VF.isVector() && "only vector VFs are supported");
1605 
1606   // Create and check the types.
1607   auto *ValVTy = cast<VectorType>(Val->getType());
1608   ElementCount VLen = ValVTy->getElementCount();
1609 
1610   Type *STy = Val->getType()->getScalarType();
1611   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1612          "Induction Step must be an integer or FP");
1613   assert(Step->getType() == STy && "Step has wrong type");
1614 
1615   SmallVector<Constant *, 8> Indices;
1616 
1617   // Create a vector of consecutive numbers from zero to VF.
1618   VectorType *InitVecValVTy = ValVTy;
1619   if (STy->isFloatingPointTy()) {
1620     Type *InitVecValSTy =
1621         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1622     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1623   }
1624   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1625 
1626   if (STy->isIntegerTy()) {
1627     Step = Builder.CreateVectorSplat(VLen, Step);
1628     assert(Step->getType() == Val->getType() && "Invalid step vec");
1629     // FIXME: The newly created binary instructions should contain nsw/nuw
1630     // flags, which can be found from the original scalar operations.
1631     Step = Builder.CreateMul(InitVec, Step);
1632     return Builder.CreateAdd(Val, Step, "induction");
1633   }
1634 
1635   // Floating point induction.
1636   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1637          "Binary Opcode should be specified for FP induction");
1638   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1639 
1640   Step = Builder.CreateVectorSplat(VLen, Step);
1641   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1642   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1643 }
1644 
1645 /// A helper function that returns an integer or floating-point constant with
1646 /// value C.
1647 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1648   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1649                            : ConstantFP::get(Ty, C);
1650 }
1651 
1652 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1653   assert(!State.Lane && "Int or FP induction being replicated.");
1654 
1655   Value *Start = getStartValue()->getLiveInIRValue();
1656   const InductionDescriptor &ID = getInductionDescriptor();
1657   TruncInst *Trunc = getTruncInst();
1658   IRBuilderBase &Builder = State.Builder;
1659   assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
1660          "Types must match");
1661   assert(State.VF.isVector() && "must have vector VF");
1662 
1663   // The value from the original loop to which we are mapping the new induction
1664   // variable.
1665   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
1666 
1667   // Fast-math-flags propagate from the original induction instruction.
1668   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1669   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1670     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1671 
1672   // Now do the actual transformations, and start with fetching the step value.
1673   Value *Step = State.get(getStepValue(), VPLane(0));
1674 
1675   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1676          "Expected either an induction phi-node or a truncate of it!");
1677 
1678   // Construct the initial value of the vector IV in the vector loop preheader
1679   auto CurrIP = Builder.saveIP();
1680   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1681   Builder.SetInsertPoint(VectorPH->getTerminator());
1682   if (isa<TruncInst>(EntryVal)) {
1683     assert(Start->getType()->isIntegerTy() &&
1684            "Truncation requires an integer type");
1685     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1686     Step = Builder.CreateTrunc(Step, TruncType);
1687     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1688   }
1689 
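  // Broadcast the start value and add the step vector (0, Step, 2 * Step, ...)
  // to form the initial value of the vector induction in the preheader.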
1690   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1691   Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
1692                                       State.VF, State.Builder);
1693 
1694   // We create vector phi nodes for both integer and floating-point induction
1695   // variables. Here, we determine the kind of arithmetic we will perform.
1696   Instruction::BinaryOps AddOp;
1697   Instruction::BinaryOps MulOp;
1698   if (Step->getType()->isIntegerTy()) {
1699     AddOp = Instruction::Add;
1700     MulOp = Instruction::Mul;
1701   } else {
1702     AddOp = ID.getInductionOpcode();
1703     MulOp = Instruction::FMul;
1704   }
1705 
1706   Value *SplatVF;
1707   if (VPValue *SplatVFOperand = getSplatVFValue()) {
1708     // The recipe has been unrolled. In that case, fetch the splat value for the
1709     // induction increment.
1710     SplatVF = State.get(SplatVFOperand);
1711   } else {
1712     // Multiply the vectorization factor by the step using integer or
1713     // floating-point arithmetic as appropriate.
1714     Type *StepType = Step->getType();
1715     Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
1716     if (Step->getType()->isFloatingPointTy())
1717       RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1718     else
1719       RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1720     Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1721 
1722     // Create a vector splat to use in the induction update.
1723     SplatVF = Builder.CreateVectorSplat(State.VF, Mul);
1724   }
1725 
1726   Builder.restoreIP(CurrIP);
1727 
1728   // We may need to add the step a number of times, depending on the unroll
1729   // factor. The last of those goes into the PHI.
1730   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1731   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1732   VecInd->setDebugLoc(getDebugLoc());
1733   State.set(this, VecInd);
1734 
1735   Instruction *LastInduction = cast<Instruction>(
1736       Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
1737   if (isa<TruncInst>(EntryVal))
1738     State.addMetadata(LastInduction, EntryVal);
1739   LastInduction->setDebugLoc(getDebugLoc());
1740 
1741   VecInd->addIncoming(SteppedStart, VectorPH);
1742   // Add induction update using an incorrect block temporarily. The phi node
1743   // will be fixed after VPlan execution. Note that at this point the latch
1744   // block cannot be used, as it does not exist yet.
1745   // TODO: Model increment value in VPlan, by turning the recipe into a
1746   // multi-def and a subclass of VPHeaderPHIRecipe.
1747   VecInd->addIncoming(LastInduction, VectorPH);
1748 }
1749 
1750 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1751 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1752                                           VPSlotTracker &SlotTracker) const {
1753   O << Indent;
1754   printAsOperand(O, SlotTracker);
1755   O << " = WIDEN-INDUCTION  ";
1756   printOperands(O, SlotTracker);
1757 
1758   if (auto *TI = getTruncInst())
1759     O << " (truncated to " << *TI->getType() << ")";
1760 }
1761 #endif
1762 
1763 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1764   // The step may be defined by a recipe in the preheader (e.g. if it requires
1765   // SCEV expansion), but for the canonical induction the step is required to be
1766   // 1, which is represented as live-in.
1767   if (getStepValue()->getDefiningRecipe())
1768     return false;
1769   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1770   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1771   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1772   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1773          getScalarType() == CanIV->getScalarType();
1774 }
1775 
1776 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1777 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1778                               VPSlotTracker &SlotTracker) const {
1779   O << Indent;
1780   printAsOperand(O, SlotTracker);
1781   O << " = DERIVED-IV ";
1782   getStartValue()->printAsOperand(O, SlotTracker);
1783   O << " + ";
1784   getOperand(1)->printAsOperand(O, SlotTracker);
1785   O << " * ";
1786   getStepValue()->printAsOperand(O, SlotTracker);
1787 }
1788 #endif
1789 
1790 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1791   // Fast-math-flags propagate from the original induction instruction.
1792   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1793   if (hasFastMathFlags())
1794     State.Builder.setFastMathFlags(getFastMathFlags());
1795 
1796   // Compute scalar induction steps. BaseIV is the scalar induction variable on
1797   // which to base the steps, and Step is the size of the step.
1798 
1799   Value *BaseIV = State.get(getOperand(0), VPLane(0));
1800   Value *Step = State.get(getStepValue(), VPLane(0));
1801   IRBuilderBase &Builder = State.Builder;
1802 
1803   // Ensure step has the same type as that of scalar IV.
1804   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1805   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1806 
1807   // We build scalar steps for both integer and floating-point induction
1808   // variables. Here, we determine the kind of arithmetic we will perform.
1809   Instruction::BinaryOps AddOp;
1810   Instruction::BinaryOps MulOp;
1811   if (BaseIVTy->isIntegerTy()) {
1812     AddOp = Instruction::Add;
1813     MulOp = Instruction::Mul;
1814   } else {
1815     AddOp = InductionOpcode;
1816     MulOp = Instruction::FMul;
1817   }
1818 
1819   // Determine the number of scalars we need to generate for each unroll
1820   // iteration.
1821   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1822   // Compute the scalar steps and save the results in State.
1823   Type *IntStepTy =
1824       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1825   Type *VecIVTy = nullptr;
1826   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1827   if (!FirstLaneOnly && State.VF.isScalable()) {
1828     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1829     UnitStepVec =
1830         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1831     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1832     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1833   }
1834 
1835   unsigned StartLane = 0;
1836   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1837   if (State.Lane) {
1838     StartLane = State.Lane->getKnownLane();
1839     EndLane = StartLane + 1;
1840   }
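  // StartIdx0 is the index of the first lane generated by this unrolled part,
  // i.e. UnrollPart * VF.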
1841   Value *StartIdx0 =
1842       createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
1843 
1844   if (!FirstLaneOnly && State.VF.isScalable()) {
1845     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1846     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1847     if (BaseIVTy->isFloatingPointTy())
1848       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1849     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1850     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1851     State.set(this, Add);
1852     // It is also useful to record the per-lane values for the known minimum
1853     // number of elements, so the loop below does that as well. This improves
1854     // code quality when extracting the first element, for example.
1855   }
1856 
1857   if (BaseIVTy->isFloatingPointTy())
1858     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1859 
1860   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1861     Value *StartIdx = Builder.CreateBinOp(
1862         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1863     // The step returned by `createStepForVF` is a runtime-evaluated value
1864     // when VF is scalable. Otherwise, it should be folded into a Constant.
1865     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1866            "Expected StartIdx to be folded to a constant when VF is not "
1867            "scalable");
1868     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1869     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1870     State.set(this, Add, VPLane(Lane));
1871   }
1872 }
1873 
1874 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1875 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1876                                   VPSlotTracker &SlotTracker) const {
1877   O << Indent;
1878   printAsOperand(O, SlotTracker);
1879   O << " = SCALAR-STEPS ";
1880   printOperands(O, SlotTracker);
1881 }
1882 #endif
1883 
1884 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1885   assert(State.VF.isVector() && "not widening");
1886   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1887   // Construct a vector GEP by widening the operands of the scalar GEP as
1888   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1889   // results in a vector of pointers when at least one operand of the GEP
1890   // is vector-typed. Thus, to keep the representation compact, we only use
1891   // vector-typed operands for loop-varying values.
1892 
1893   if (areAllOperandsInvariant()) {
1894     // If we are vectorizing, but the GEP has only loop-invariant operands,
1895     // the GEP we build (by only using vector-typed operands for
1896     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1897     // produce a vector of pointers, we need to either arbitrarily pick an
1898     // operand to broadcast, or broadcast a clone of the original GEP.
1899     // Here, we broadcast a clone of the original.
1900     //
1901     // TODO: If at some point we decide to scalarize instructions having
1902     //       loop-invariant operands, this special case will no longer be
1903     //       required. We would add the scalarization decision to
1904     //       collectLoopScalars() and teach getVectorValue() to broadcast
1905     //       the lane-zero scalar value.
1906     SmallVector<Value *> Ops;
1907     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1908       Ops.push_back(State.get(getOperand(I), VPLane(0)));
1909 
1910     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1911                                            ArrayRef(Ops).drop_front(), "",
1912                                            getGEPNoWrapFlags());
1913     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1914     State.set(this, Splat);
1915     State.addMetadata(Splat, GEP);
1916   } else {
1917     // If the GEP has at least one loop-varying operand, we are sure to
1918     // produce a vector of pointers unless VF is scalar.
1919     // The pointer operand of the new GEP. If it's loop-invariant, we
1920     // won't broadcast it.
1921     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
1922                                          : State.get(getOperand(0));
1923 
1924     // Collect all the indices for the new GEP. If any index is
1925     // loop-invariant, we won't broadcast it.
1926     SmallVector<Value *, 4> Indices;
1927     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1928       VPValue *Operand = getOperand(I);
1929       if (isIndexLoopInvariant(I - 1))
1930         Indices.push_back(State.get(Operand, VPLane(0)));
1931       else
1932         Indices.push_back(State.get(Operand));
1933     }
1934 
1935     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1936     // but it should be a vector, otherwise.
1937     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1938                                            Indices, "", getGEPNoWrapFlags());
1939     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1940            "NewGEP is not a pointer vector");
1941     State.set(this, NewGEP);
1942     State.addMetadata(NewGEP, GEP);
1943   }
1944 }
1945 
1946 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1947 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1948                              VPSlotTracker &SlotTracker) const {
1949   O << Indent << "WIDEN-GEP ";
1950   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1951   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1952     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1953 
1954   O << " ";
1955   printAsOperand(O, SlotTracker);
1956   O << " = getelementptr";
1957   printFlags(O);
1958   printOperands(O, SlotTracker);
1959 }
1960 #endif
1961 
1962 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
1963                            unsigned CurrentPart, IRBuilderBase &Builder) {
1964   // Use i32 for the gep index type when the value is constant,
1965   // or query DataLayout for a more suitable index type otherwise.
1966   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
1967   return IsScalable && (IsReverse || CurrentPart > 0)
1968              ? DL.getIndexType(Builder.getPtrTy(0))
1969              : Builder.getInt32Ty();
1970 }
1971 
1972 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) {
1973   auto &Builder = State.Builder;
1974   State.setDebugLocFrom(getDebugLoc());
1975   unsigned CurrentPart = getUnrollPart(*this);
1976   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
1977                                 CurrentPart, Builder);
1978 
1979   // The wide store needs to start at the last vector element.
1980   Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
1981   if (IndexTy != RunTimeVF->getType())
1982     RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
1983   // NumElt = -CurrentPart * RunTimeVF
1984   Value *NumElt = Builder.CreateMul(
1985       ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
1986   // LastLane = 1 - RunTimeVF
1987   Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1988   Value *Ptr = State.get(getOperand(0), VPLane(0));
1989   Value *ResultPtr =
1990       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
1991   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
1992                                 getGEPNoWrapFlags());
1993 
1994   State.set(this, ResultPtr, /*IsScalar*/ true);
1995 }
1996 
1997 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1998 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1999                                          VPSlotTracker &SlotTracker) const {
2000   O << Indent;
2001   printAsOperand(O, SlotTracker);
2002   O << " = reverse-vector-pointer";
2003   printFlags(O);
2004   O << " ";
2005   printOperands(O, SlotTracker);
2006 }
2007 #endif
2008 
2009 void VPVectorPointerRecipe::execute(VPTransformState &State) {
2010   auto &Builder = State.Builder;
2011   State.setDebugLocFrom(getDebugLoc());
2012   unsigned CurrentPart = getUnrollPart(*this);
2013   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2014                                 CurrentPart, Builder);
2015   Value *Ptr = State.get(getOperand(0), VPLane(0));
2016 
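  // Advance the scalar base pointer by CurrentPart * VF elements so each
  // unrolled part addresses its own chunk of the vector.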
2017   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2018   Value *ResultPtr =
2019       Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
2020 
2021   State.set(this, ResultPtr, /*IsScalar*/ true);
2022 }
2023 
2024 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2025 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2026                                   VPSlotTracker &SlotTracker) const {
2027   O << Indent;
2028   printAsOperand(O, SlotTracker);
2029   O << " = vector-pointer ";
2030 
2031   printOperands(O, SlotTracker);
2032 }
2033 #endif
2034 
2035 void VPBlendRecipe::execute(VPTransformState &State) {
2036   assert(isNormalized() && "Expected blend to be normalized!");
2037   State.setDebugLocFrom(getDebugLoc());
2038   // We know that all PHIs in non-header blocks are converted into
2039   // selects, so we don't have to worry about the insertion order and we
2040   // can just use the builder.
2041   // At this point we generate the predication tree. There may be
2042   // duplications since this is a simple recursive scan, but future
2043   // optimizations will clean it up.
2044 
2045   unsigned NumIncoming = getNumIncomingValues();
2046 
2047   // Generate a sequence of selects of the form:
2048   // SELECT(Mask3, In3,
2049   //        SELECT(Mask2, In2,
2050   //               SELECT(Mask1, In1,
2051   //                      In0)))
2052   // Note that Mask0 is never used: lanes for which no path reaches this phi,
2053   // and which are therefore essentially undef, take their value from In0.
2054   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
2055   Value *Result = nullptr;
2056   for (unsigned In = 0; In < NumIncoming; ++In) {
2057     // We might have single-edge PHIs (blocks); the first incoming value needs
2058     // no select and is used directly.
2059     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
2060     if (In == 0)
2061       Result = In0; // Initialize with the first incoming value.
2062     else {
2063       // Select between the current value and the previous incoming edge
2064       // based on the incoming mask.
2065       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
2066       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
2067     }
2068   }
2069   State.set(this, Result, OnlyFirstLaneUsed);
2070 }
2071 
2072 InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2073                                            VPCostContext &Ctx) const {
2074   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2075 
2076   // When only the first lane is used, cost the blend as a single scalar phi,
2077   // matching the legacy cost model.
2078   if (vputils::onlyFirstLaneUsed(this))
2079     return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
2080 
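  // A normalized blend with N incoming values lowers to N - 1 vector selects.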
2081   Type *ResultTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
2082   Type *CmpTy = ToVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2083   return (getNumIncomingValues() - 1) *
2084          Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2085                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
2086 }
2087 
2088 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2089 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
2090                           VPSlotTracker &SlotTracker) const {
2091   O << Indent << "BLEND ";
2092   printAsOperand(O, SlotTracker);
2093   O << " =";
2094   if (getNumIncomingValues() == 1) {
2095     // Not a User of any mask: not really blending, this is a
2096     // single-predecessor phi.
2097     O << " ";
2098     getIncomingValue(0)->printAsOperand(O, SlotTracker);
2099   } else {
2100     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2101       O << " ";
2102       getIncomingValue(I)->printAsOperand(O, SlotTracker);
2103       if (I == 0)
2104         continue;
2105       O << "/";
2106       getMask(I)->printAsOperand(O, SlotTracker);
2107     }
2108   }
2109 }
2110 #endif
2111 
2112 void VPReductionRecipe::execute(VPTransformState &State) {
2113   assert(!State.Lane && "Reduction being replicated.");
2114   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2115   RecurKind Kind = RdxDesc.getRecurrenceKind();
2116   // Propagate the fast-math flags carried by the underlying instruction.
2117   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2118   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2119   Value *NewVecOp = State.get(getVecOp());
2120   if (VPValue *Cond = getCondOp()) {
2121     Value *NewCond = State.get(Cond, State.VF.isScalar());
2122     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2123     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2124 
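    // Replace masked-off lanes with a neutral value (the recurrence start value
    // for any-of reductions, the reduction identity otherwise) so they do not
    // affect the reduced result.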
2125     Value *Start;
2126     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
2127       Start = RdxDesc.getRecurrenceStartValue();
2128     else
2129       Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
2130                                           RdxDesc.getFastMathFlags());
2131     if (State.VF.isVector())
2132       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2133 
2134     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2135     NewVecOp = Select;
2136   }
2137   Value *NewRed;
2138   Value *NextInChain;
2139   if (IsOrdered) {
2140     if (State.VF.isVector())
2141       NewRed =
2142           createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
2143     else
2144       NewRed = State.Builder.CreateBinOp(
2145           (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp);
2146     PrevInChain = NewRed;
2147     NextInChain = NewRed;
2148   } else {
2149     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2150     NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
2151     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2152       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
2153                                    NewRed, PrevInChain);
2154     else
2155       NextInChain = State.Builder.CreateBinOp(
2156           (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain);
2157   }
2158   State.set(this, NextInChain, /*IsScalar*/ true);
2159 }
2160 
2161 void VPReductionEVLRecipe::execute(VPTransformState &State) {
2162   assert(!State.Lane && "Reduction being replicated.");
2163 
2164   auto &Builder = State.Builder;
2165   // Propagate the fast-math flags carried by the underlying instruction.
2166   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2167   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2168   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2169 
2170   RecurKind Kind = RdxDesc.getRecurrenceKind();
2171   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2172   Value *VecOp = State.get(getVecOp());
2173   Value *EVL = State.get(getEVL(), VPLane(0));
2174 
2175   VectorBuilder VBuilder(Builder);
2176   VBuilder.setEVL(EVL);
2177   Value *Mask;
2178   // TODO: move the all-true mask generation into VectorBuilder.
2179   if (VPValue *CondOp = getCondOp())
2180     Mask = State.get(CondOp);
2181   else
2182     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2183   VBuilder.setMask(Mask);
2184 
2185   Value *NewRed;
2186   if (isOrdered()) {
2187     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
2188   } else {
2189     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
2190     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2191       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2192     else
2193       NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(),
2194                                    NewRed, Prev);
2195   }
2196   State.set(this, NewRed, /*IsScalar*/ true);
2197 }
2198 
2199 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2200                                                VPCostContext &Ctx) const {
2201   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
2202   Type *ElementTy = Ctx.Types.inferScalarType(this);
2203   auto *VectorTy = cast<VectorType>(ToVectorTy(ElementTy, VF));
2204   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2205   unsigned Opcode = RdxDesc.getOpcode();
2206 
2207   // TODO: Support any-of and in-loop reductions.
2208   assert(
2209       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2210        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2211       "Any-of reduction not implemented in VPlan-based cost model currently.");
2212   assert(
2213       (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
2214        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2215       "In-loop reduction not implemented in VPlan-based cost model currently.");
2216 
2217   assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() &&
2218          "Inferred type and recurrence type mismatch.");
2219 
2220   // Cost = Reduction cost + BinOp cost
2221   InstructionCost Cost =
2222       Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
2223   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
2224     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
2225     return Cost + Ctx.TTI.getMinMaxReductionCost(
2226                       Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
2227   }
2228 
2229   return Cost + Ctx.TTI.getArithmeticReductionCost(
2230                     Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
2231 }
2232 
2233 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2234 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2235                               VPSlotTracker &SlotTracker) const {
2236   O << Indent << "REDUCE ";
2237   printAsOperand(O, SlotTracker);
2238   O << " = ";
2239   getChainOp()->printAsOperand(O, SlotTracker);
2240   O << " +";
2241   if (isa<FPMathOperator>(getUnderlyingInstr()))
2242     O << getUnderlyingInstr()->getFastMathFlags();
2243   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2244   getVecOp()->printAsOperand(O, SlotTracker);
2245   if (isConditional()) {
2246     O << ", ";
2247     getCondOp()->printAsOperand(O, SlotTracker);
2248   }
2249   O << ")";
2250   if (RdxDesc.IntermediateStore)
2251     O << " (with final reduction value stored in invariant address sank "
2252          "outside of loop)";
2253 }
2254 
2255 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2256                                  VPSlotTracker &SlotTracker) const {
2257   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2258   O << Indent << "REDUCE ";
2259   printAsOperand(O, SlotTracker);
2260   O << " = ";
2261   getChainOp()->printAsOperand(O, SlotTracker);
2262   O << " +";
2263   if (isa<FPMathOperator>(getUnderlyingInstr()))
2264     O << getUnderlyingInstr()->getFastMathFlags();
2265   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2266   getVecOp()->printAsOperand(O, SlotTracker);
2267   O << ", ";
2268   getEVL()->printAsOperand(O, SlotTracker);
2269   if (isConditional()) {
2270     O << ", ";
2271     getCondOp()->printAsOperand(O, SlotTracker);
2272   }
2273   O << ")";
2274   if (RdxDesc.IntermediateStore)
2275     O << " (with final reduction value stored in invariant address sank "
2276          "outside of loop)";
2277 }
2278 #endif
2279 
2280 bool VPReplicateRecipe::shouldPack() const {
2281   // Find if the recipe is used by a widened recipe via an intervening
2282   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2283   return any_of(users(), [](const VPUser *U) {
2284     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2285       return any_of(PredR->users(), [PredR](const VPUser *U) {
2286         return !U->usesScalars(PredR);
2287       });
2288     return false;
2289   });
2290 }
2291 
2292 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
2293                                                VPCostContext &Ctx) const {
2294   Instruction *UI = cast<Instruction>(getUnderlyingValue());
2295   // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
2296   // transform; avoid computing its cost multiple times for now.
2297   Ctx.SkipCostComputation.insert(UI);
2298   return Ctx.getLegacyCost(UI, VF);
2299 }
2300 
2301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2302 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2303                               VPSlotTracker &SlotTracker) const {
2304   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2305 
2306   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2307     printAsOperand(O, SlotTracker);
2308     O << " = ";
2309   }
2310   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2311     O << "call";
2312     printFlags(O);
2313     O << "@" << CB->getCalledFunction()->getName() << "(";
2314     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2315                     O, [&O, &SlotTracker](VPValue *Op) {
2316                       Op->printAsOperand(O, SlotTracker);
2317                     });
2318     O << ")";
2319   } else {
2320     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2321     printFlags(O);
2322     printOperands(O, SlotTracker);
2323   }
2324 
2325   if (shouldPack())
2326     O << " (S->V)";
2327 }
2328 #endif
2329 
2330 Value *VPScalarCastRecipe ::generate(VPTransformState &State) {
2331   assert(vputils::onlyFirstLaneUsed(this) &&
2332          "Codegen only implemented for first lane.");
2333   switch (Opcode) {
2334   case Instruction::SExt:
2335   case Instruction::ZExt:
2336   case Instruction::Trunc: {
2337     // Note: SExt/ZExt not used yet.
2338     Value *Op = State.get(getOperand(0), VPLane(0));
2339     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2340   }
2341   default:
2342     llvm_unreachable("opcode not implemented yet");
2343   }
2344 }
2345 
2346 void VPScalarCastRecipe ::execute(VPTransformState &State) {
2347   State.set(this, generate(State), VPLane(0));
2348 }
2349 
2350 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2351 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
2352                                 VPSlotTracker &SlotTracker) const {
2353   O << Indent << "SCALAR-CAST ";
2354   printAsOperand(O, SlotTracker);
2355   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2356   printOperands(O, SlotTracker);
2357   O << " to " << *ResultTy;
2358 }
2359 #endif
2360 
2361 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2362   assert(State.Lane && "Branch on Mask works only on single instance.");
2363 
2364   unsigned Lane = State.Lane->getKnownLane();
2365 
2366   Value *ConditionBit = nullptr;
2367   VPValue *BlockInMask = getMask();
2368   if (BlockInMask) {
2369     ConditionBit = State.get(BlockInMask);
2370     if (ConditionBit->getType()->isVectorTy())
2371       ConditionBit = State.Builder.CreateExtractElement(
2372           ConditionBit, State.Builder.getInt32(Lane));
2373   } else // Block in mask is all-one.
2374     ConditionBit = State.Builder.getTrue();
2375 
2376   // Replace the temporary unreachable terminator with a new conditional branch,
2377   // whose two destinations will be set later when they are created.
2378   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2379   assert(isa<UnreachableInst>(CurrentTerminator) &&
2380          "Expected to replace unreachable terminator with conditional branch.");
2381   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2382   CondBr->setSuccessor(0, nullptr);
2383   ReplaceInstWithInst(CurrentTerminator, CondBr);
2384 }
2385 
2386 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
2387                                                   VPCostContext &Ctx) const {
2388   // The legacy cost model doesn't assign costs to branches for individual
2389   // replicate regions. Match the current behavior in the VPlan cost model for
2390   // now.
2391   return 0;
2392 }
2393 
2394 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2395   State.setDebugLocFrom(getDebugLoc());
2396   assert(State.Lane && "Predicated instruction PHI works per instance.");
2397   Instruction *ScalarPredInst =
2398       cast<Instruction>(State.get(getOperand(0), *State.Lane));
2399   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2400   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2401   assert(PredicatingBB && "Predicated block has no single predecessor.");
2402   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2403          "operand must be VPReplicateRecipe");
2404 
2405   // By current pack/unpack logic we need to generate only a single phi node: if
2406   // a vector value for the predicated instruction exists at this point it means
2407   // the instruction has vector users only, and a phi for the vector value is
2408   // needed. In this case the recipe of the predicated instruction is marked to
2409   // also do that packing, thereby "hoisting" the insert-element sequence.
2410   // Otherwise, a phi node for the scalar value is needed.
2411   if (State.hasVectorValue(getOperand(0))) {
2412     Value *VectorValue = State.get(getOperand(0));
2413     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2414     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2415     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2416     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2417     if (State.hasVectorValue(this))
2418       State.reset(this, VPhi);
2419     else
2420       State.set(this, VPhi);
2421     // NOTE: Currently we need to update the value of the operand, so the next
2422     // predicated iteration inserts its generated value in the correct vector.
2423     State.reset(getOperand(0), VPhi);
2424   } else {
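    // When only the first lane of this recipe is used, a single scalar phi for
    // lane 0 suffices; skip all other lanes.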
2425     if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
2426       return;
2427 
2428     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2429     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2430     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2431                      PredicatingBB);
2432     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2433     if (State.hasScalarValue(this, *State.Lane))
2434       State.reset(this, Phi, *State.Lane);
2435     else
2436       State.set(this, Phi, *State.Lane);
2437     // NOTE: Currently we need to update the value of the operand, so the next
2438     // predicated iteration inserts its generated value in the correct vector.
2439     State.reset(getOperand(0), Phi, *State.Lane);
2440   }
2441 }
2442 
2443 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2444 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2445                                 VPSlotTracker &SlotTracker) const {
2446   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2447   printAsOperand(O, SlotTracker);
2448   O << " = ";
2449   printOperands(O, SlotTracker);
2450 }
2451 #endif
2452 
2453 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2454                                                  VPCostContext &Ctx) const {
2455   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2456   const Align Alignment =
2457       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2458   unsigned AS =
2459       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2460   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2461 
2462   if (!Consecutive) {
2463     // TODO: Using the original IR may not be accurate.
2464     // Currently, ARM will use the underlying IR to calculate gather/scatter
2465     // instruction cost.
2466     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2467     assert(!Reverse &&
2468            "Inconsecutive memory access should not have the order.");
2469     return Ctx.TTI.getAddressComputationCost(Ty) +
2470            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2471                                           IsMasked, Alignment, CostKind,
2472                                           &Ingredient);
2473   }
2474 
2475   InstructionCost Cost = 0;
2476   if (IsMasked) {
2477     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2478                                           AS, CostKind);
2479   } else {
2480     TTI::OperandValueInfo OpInfo =
2481         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2482     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2483                                     CostKind, OpInfo, &Ingredient);
2484   }
2485   if (!Reverse)
2486     return Cost;
2487 
2488   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2489                                         cast<VectorType>(Ty), {}, CostKind, 0);
2490 }
2491 
2492 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2493   auto *LI = cast<LoadInst>(&Ingredient);
2494 
2495   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2496   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2497   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2498   bool CreateGather = !isConsecutive();
2499 
2500   auto &Builder = State.Builder;
2501   State.setDebugLocFrom(getDebugLoc());
2502   Value *Mask = nullptr;
2503   if (auto *VPMask = getMask()) {
2504     // Mask reversal is only needed for real (non-null) masks; an all-one mask
2505     // is represented as null, and its reverse is again all-one (null).
2506     Mask = State.get(VPMask);
2507     if (isReverse())
2508       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2509   }
2510 
2511   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
2512   Value *NewLI;
2513   if (CreateGather) {
2514     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2515                                        "wide.masked.gather");
2516   } else if (Mask) {
2517     NewLI =
2518         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2519                                  PoisonValue::get(DataTy), "wide.masked.load");
2520   } else {
2521     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2522   }
2523   // Add metadata to the load, but set the recipe's result to the reverse shuffle.
2524   State.addMetadata(NewLI, LI);
2525   if (Reverse)
2526     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2527   State.set(this, NewLI);
2528 }
2529 
2530 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2531 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2532                               VPSlotTracker &SlotTracker) const {
2533   O << Indent << "WIDEN ";
2534   printAsOperand(O, SlotTracker);
2535   O << " = load ";
2536   printOperands(O, SlotTracker);
2537 }
2538 #endif
2539 
2540 /// Use an all-true mask for the reverse rather than the actual mask, as this
2541 /// avoids a dependence without affecting the result.
2542 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2543                                      Value *EVL, const Twine &Name) {
2544   VectorType *ValTy = cast<VectorType>(Operand->getType());
2545   Value *AllTrueMask =
2546       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2547   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2548                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2549 }
2550 
2551 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2552   auto *LI = cast<LoadInst>(&Ingredient);
2553 
2554   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2555   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2556   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2557   bool CreateGather = !isConsecutive();
2558 
2559   auto &Builder = State.Builder;
2560   State.setDebugLocFrom(getDebugLoc());
2561   CallInst *NewLI;
2562   Value *EVL = State.get(getEVL(), VPLane(0));
2563   Value *Addr = State.get(getAddr(), !CreateGather);
2564   Value *Mask = nullptr;
2565   if (VPValue *VPMask = getMask()) {
2566     Mask = State.get(VPMask);
2567     if (isReverse())
2568       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2569   } else {
2570     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2571   }
2572 
2573   if (CreateGather) {
2574     NewLI =
2575         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2576                                 nullptr, "wide.masked.gather");
2577   } else {
2578     VectorBuilder VBuilder(Builder);
2579     VBuilder.setEVL(EVL).setMask(Mask);
2580     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2581         Instruction::Load, DataTy, Addr, "vp.op.load"));
2582   }
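  // Both vp.load and vp.gather take the pointer as their first argument; record
  // the alignment as a parameter attribute on it.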
2583   NewLI->addParamAttr(
2584       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2585   State.addMetadata(NewLI, LI);
2586   Instruction *Res = NewLI;
2587   if (isReverse())
2588     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2589   State.set(this, Res);
2590 }
2591 
2592 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
2593                                                   VPCostContext &Ctx) const {
2594   if (!Consecutive || IsMasked)
2595     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2596 
2597   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2598   // because EVL recipes use the EVL to replace the tail mask, whereas the
2599   // legacy model always includes the cost of the mask.
2600   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2601   // longer need to match the legacy cost model.
2602   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2603   const Align Alignment =
2604       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2605   unsigned AS =
2606       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2607   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2608   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2609       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2610   if (!Reverse)
2611     return Cost;
2612 
2613   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2614                                        cast<VectorType>(Ty), {}, CostKind, 0);
2615 }
2616 
2617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2618 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2619                                  VPSlotTracker &SlotTracker) const {
2620   O << Indent << "WIDEN ";
2621   printAsOperand(O, SlotTracker);
2622   O << " = vp.load ";
2623   printOperands(O, SlotTracker);
2624 }
2625 #endif
2626 
2627 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2628   auto *SI = cast<StoreInst>(&Ingredient);
2629 
2630   VPValue *StoredVPValue = getStoredValue();
2631   bool CreateScatter = !isConsecutive();
2632   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2633 
2634   auto &Builder = State.Builder;
2635   State.setDebugLocFrom(getDebugLoc());
2636 
2637   Value *Mask = nullptr;
2638   if (auto *VPMask = getMask()) {
2639     // Mask reversal is only needed for real (non-null) masks; an all-one mask
2640     // is represented as null, and its reverse is again all-one (null).
2641     Mask = State.get(VPMask);
2642     if (isReverse())
2643       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2644   }
2645 
2646   Value *StoredVal = State.get(StoredVPValue);
2647   if (isReverse()) {
2648     // If we store to reverse consecutive memory locations, then we need
2649     // to reverse the order of elements in the stored value.
2650     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2651     // We don't want to update the value in the map as it might be used in
2652     // another expression. So don't call resetVectorValue(StoredVal).
2653   }
2654   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2655   Instruction *NewSI = nullptr;
2656   if (CreateScatter)
2657     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2658   else if (Mask)
2659     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2660   else
2661     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2662   State.addMetadata(NewSI, SI);
2663 }
2664 
2665 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2666 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2667                                VPSlotTracker &SlotTracker) const {
2668   O << Indent << "WIDEN store ";
2669   printOperands(O, SlotTracker);
2670 }
2671 #endif
2672 
2673 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2674   auto *SI = cast<StoreInst>(&Ingredient);
2675 
2676   VPValue *StoredValue = getStoredValue();
2677   bool CreateScatter = !isConsecutive();
2678   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2679 
2680   auto &Builder = State.Builder;
2681   State.setDebugLocFrom(getDebugLoc());
2682 
2683   CallInst *NewSI = nullptr;
2684   Value *StoredVal = State.get(StoredValue);
2685   Value *EVL = State.get(getEVL(), VPLane(0));
2686   if (isReverse())
2687     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2688   Value *Mask = nullptr;
2689   if (VPValue *VPMask = getMask()) {
2690     Mask = State.get(VPMask);
2691     if (isReverse())
2692       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2693   } else {
2694     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2695   }
2696   Value *Addr = State.get(getAddr(), !CreateScatter);
2697   if (CreateScatter) {
2698     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2699                                     Intrinsic::vp_scatter,
2700                                     {StoredVal, Addr, Mask, EVL});
2701   } else {
2702     VectorBuilder VBuilder(Builder);
2703     VBuilder.setEVL(EVL).setMask(Mask);
2704     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2705         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2706         {StoredVal, Addr}));
2707   }
2708   NewSI->addParamAttr(
2709       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2710   State.addMetadata(NewSI, SI);
2711 }
2712 
2713 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
2714                                                    VPCostContext &Ctx) const {
2715   if (!Consecutive || IsMasked)
2716     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2717 
2718   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
2719   // here because EVL recipes use EVL to replace the tail mask, whereas the
2720   // legacy model always accounts for the cost of the mask.
2721   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
2722   // no longer need to compare against the legacy cost model.
2723   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2724   const Align Alignment =
2725       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2726   unsigned AS =
2727       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2728   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2729   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2730       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2731   if (!Reverse)
2732     return Cost;
2733 
2734   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2735                                        cast<VectorType>(Ty), {}, CostKind, 0);
2736 }
2737 
2738 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2739 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2740                                   VPSlotTracker &SlotTracker) const {
2741   O << Indent << "WIDEN vp.store ";
2742   printOperands(O, SlotTracker);
2743 }
2744 #endif
2745 
2746 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2747                                      VectorType *DstVTy, const DataLayout &DL) {
2748   // Verify that V is a vector with the same number of elements as DstVTy.
2749   auto VF = DstVTy->getElementCount();
2750   auto *SrcVecTy = cast<VectorType>(V->getType());
2751   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2752   Type *SrcElemTy = SrcVecTy->getElementType();
2753   Type *DstElemTy = DstVTy->getElementType();
2754   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2755          "Vector elements must have same size");
2756 
2757   // Do a direct cast if element types are castable.
2758   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2759     return Builder.CreateBitOrPointerCast(V, DstVTy);
2760   }
2761   // V cannot be cast directly to the desired vector type. This may happen
2762   // when V is a floating point vector but DstVTy is a vector of pointers,
2763   // or vice-versa. Handle it with a two-step bitcast through an
2764   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
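  // E.g. (illustrative, assuming 64-bit pointers) casting <4 x double> to
  // <4 x ptr> becomes:
  //   %cast = bitcast <4 x double> %v to <4 x i64>
  //   %res  = inttoptr <4 x i64> %cast to <4 x ptr>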
2765   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2766          "Only one type should be a pointer type");
2767   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2768          "Only one type should be a floating point type");
2769   Type *IntTy =
2770       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2771   auto *VecIntTy = VectorType::get(IntTy, VF);
2772   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2773   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2774 }
2775 
2776 /// Return a vector containing interleaved elements from multiple
2777 /// smaller input vectors.
2778 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2779                                 const Twine &Name) {
2780   unsigned Factor = Vals.size();
2781   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2782 
2783   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2784 #ifndef NDEBUG
2785   for (Value *Val : Vals)
2786     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2787 #endif
2788 
2789   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2790   // we must use intrinsics to interleave.
2791   if (VecTy->isScalableTy()) {
2792     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2793     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2794                                    Vals,
2795                                    /*FMFSource=*/nullptr, Name);
2796   }
2797 
2798   // Fixed length. Start by concatenating all vectors into a wide vector.
2799   Value *WideVec = concatenateVectors(Builder, Vals);
2800 
2801   // Interleave the elements into the wide vector.
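  // E.g. for two <4 x i32> inputs the concatenated vector has 8 elements and
  // the interleave mask is <0, 4, 1, 5, 2, 6, 3, 7>.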
2802   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2803   return Builder.CreateShuffleVector(
2804       WideVec, createInterleaveMask(NumElts, Factor), Name);
2805 }
2806 
2807 // Try to vectorize the interleave group that \p Instr belongs to.
2808 //
2809 // E.g. Translate following interleaved load group (factor = 3):
2810 //   for (i = 0; i < N; i+=3) {
2811 //     R = Pic[i];             // Member of index 0
2812 //     G = Pic[i+1];           // Member of index 1
2813 //     B = Pic[i+2];           // Member of index 2
2814 //     ... // do something to R, G, B
2815 //   }
2816 // To:
2817 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2818 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2819 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2820 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2821 //
2822 // Or translate following interleaved store group (factor = 3):
2823 //   for (i = 0; i < N; i+=3) {
2824 //     ... do something to R, G, B
2825 //     Pic[i]   = R;           // Member of index 0
2826 //     Pic[i+1] = G;           // Member of index 1
2827 //     Pic[i+2] = B;           // Member of index 2
2828 //   }
2829 // To:
2830 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2831 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2832 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2833 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2834 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2835 void VPInterleaveRecipe::execute(VPTransformState &State) {
2836   assert(!State.Lane && "Interleave group being replicated.");
2837   const InterleaveGroup<Instruction> *Group = IG;
2838   Instruction *Instr = Group->getInsertPos();
2839 
2840   // Prepare for the vector type of the interleaved load/store.
2841   Type *ScalarTy = getLoadStoreType(Instr);
2842   unsigned InterleaveFactor = Group->getFactor();
2843   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2844 
2845   // TODO: extend the masked interleaved-group support to reversed access.
2846   VPValue *BlockInMask = getMask();
2847   assert((!BlockInMask || !Group->isReverse()) &&
2848          "Reversed masked interleave-group not supported.");
2849 
2850   VPValue *Addr = getAddr();
2851   Value *ResAddr = State.get(Addr, VPLane(0));
2852   if (auto *I = dyn_cast<Instruction>(ResAddr))
2853     State.setDebugLocFrom(I->getDebugLoc());
2854 
2855   // If the group is reverse, adjust the index to refer to the last vector lane
2856   // instead of the first. We adjust the index from the first vector lane,
2857   // rather than directly getting the pointer for lane VF - 1, because the
2858   // pointer operand of the interleaved access is supposed to be uniform.
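  // E.g. for VF = 4 and factor = 2, the index is -(4 - 1) * 2 = -6, moving the
  // pointer back to the group accessed by the last vector lane.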
2859   if (Group->isReverse()) {
2860     Value *RuntimeVF =
2861         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2862     Value *Index =
2863         State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2864     Index = State.Builder.CreateMul(Index,
2865                                     State.Builder.getInt32(Group->getFactor()));
2866     Index = State.Builder.CreateNeg(Index);
2867 
2868     bool InBounds = false;
2869     if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
2870       InBounds = Gep->isInBounds();
2871     ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
2872   }
2873 
2874   State.setDebugLocFrom(Instr->getDebugLoc());
2875   Value *PoisonVec = PoisonValue::get(VecTy);
2876 
2877   auto CreateGroupMask = [&BlockInMask, &State,
2878                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
2879     if (State.VF.isScalable()) {
2880       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2881       assert(InterleaveFactor == 2 &&
2882              "Unsupported deinterleave factor for scalable vectors");
2883       auto *ResBlockInMask = State.get(BlockInMask);
2884       SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2885       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2886                                      State.VF.getKnownMinValue() * 2, true);
2887       return State.Builder.CreateIntrinsic(
2888           MaskTy, Intrinsic::vector_interleave2, Ops,
2889           /*FMFSource=*/nullptr, "interleaved.mask");
2890     }
2891 
2892     if (!BlockInMask)
2893       return MaskForGaps;
2894 
2895     Value *ResBlockInMask = State.get(BlockInMask);
2896     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2897         ResBlockInMask,
2898         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2899         "interleaved.mask");
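    // E.g. for InterleaveFactor == 3 and VF == 4, the replicated mask is
    // <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>, so each lane of the block mask
    // guards all members of its group.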
2900     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2901                                                    ShuffledMask, MaskForGaps)
2902                        : ShuffledMask;
2903   };
2904 
2905   const DataLayout &DL = Instr->getDataLayout();
2906   // Vectorize the interleaved load group.
2907   if (isa<LoadInst>(Instr)) {
2908     Value *MaskForGaps = nullptr;
2909     if (NeedsMaskForGaps) {
2910       MaskForGaps = createBitMaskForGaps(State.Builder,
2911                                          State.VF.getKnownMinValue(), *Group);
2912       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2913     }
2914 
2915     Instruction *NewLoad;
2916     if (BlockInMask || MaskForGaps) {
2917       Value *GroupMask = CreateGroupMask(MaskForGaps);
2918       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
2919                                                Group->getAlign(), GroupMask,
2920                                                PoisonVec, "wide.masked.vec");
2921     } else
2922       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
2923                                                 Group->getAlign(), "wide.vec");
2924     Group->addMetadata(NewLoad);
2925 
2926     ArrayRef<VPValue *> VPDefs = definedValues();
2927     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2928     if (VecTy->isScalableTy()) {
2929       assert(InterleaveFactor == 2 &&
2930              "Unsupported deinterleave factor for scalable vectors");
2931 
2932       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2933       // so we must use intrinsics to deinterleave.
2934       Value *DI = State.Builder.CreateIntrinsic(
2935           Intrinsic::vector_deinterleave2, VecTy, NewLoad,
2936           /*FMFSource=*/nullptr, "strided.vec");
2937       unsigned J = 0;
2938       for (unsigned I = 0; I < InterleaveFactor; ++I) {
2939         Instruction *Member = Group->getMember(I);
2940 
2941         if (!Member)
2942           continue;
2943 
2944         Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2945         // If this member has different type, cast the result type.
2946         // If this member has a different type, cast the result to that type.
2947           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2948           StridedVec =
2949               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2950         }
2951 
2952         if (Group->isReverse())
2953           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2954 
2955         State.set(VPDefs[J], StridedVec);
2956         ++J;
2957       }
2958 
2959       return;
2960     }
2961 
2962     // For each member in the group, shuffle out the appropriate data from the
2963     // wide loads.
2964     unsigned J = 0;
2965     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2966       Instruction *Member = Group->getMember(I);
2967 
2968       // Skip the gaps in the group.
2969       if (!Member)
2970         continue;
2971 
2972       auto StrideMask =
2973           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2974       Value *StridedVec =
2975           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
2976 
2977       // If this member has a different type, cast the result to that type.
2978       if (Member->getType() != ScalarTy) {
2979         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2980         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2981         StridedVec =
2982             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2983       }
2984 
2985       if (Group->isReverse())
2986         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2987 
2988       State.set(VPDefs[J], StridedVec);
2989       ++J;
2990     }
2991     return;
2992   }
2993 
2994   // The sub-vector type for the current instruction.
2995   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2996 
2997   // Vectorize the interleaved store group.
2998   Value *MaskForGaps =
2999       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
3000   assert((!MaskForGaps || !State.VF.isScalable()) &&
3001          "masking gaps for scalable vectors is not yet supported.");
3002   ArrayRef<VPValue *> StoredValues = getStoredValues();
3003   // Collect the stored vector from each member.
3004   SmallVector<Value *, 4> StoredVecs;
3005   unsigned StoredIdx = 0;
3006   for (unsigned i = 0; i < InterleaveFactor; i++) {
3007     assert((Group->getMember(i) || MaskForGaps) &&
3008            "Failed to get a member from an interleaved store group");
3009     Instruction *Member = Group->getMember(i);
3010 
3011     // Skip the gaps in the group.
3012     if (!Member) {
3013       Value *Undef = PoisonValue::get(SubVT);
3014       StoredVecs.push_back(Undef);
3015       continue;
3016     }
3017 
3018     Value *StoredVec = State.get(StoredValues[StoredIdx]);
3019     ++StoredIdx;
3020 
3021     if (Group->isReverse())
3022       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
3023 
3024     // If this member has a different type, cast it to the unified type.
3026     if (StoredVec->getType() != SubVT)
3027       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
3028 
3029     StoredVecs.push_back(StoredVec);
3030   }
3031 
3032   // Interleave all the smaller vectors into one wider vector.
3033   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
3034   Instruction *NewStoreInstr;
3035   if (BlockInMask || MaskForGaps) {
3036     Value *GroupMask = CreateGroupMask(MaskForGaps);
3037     NewStoreInstr = State.Builder.CreateMaskedStore(
3038         IVec, ResAddr, Group->getAlign(), GroupMask);
3039   } else
3040     NewStoreInstr =
3041         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
3042 
3043   Group->addMetadata(NewStoreInstr);
3044 }
3045 
3046 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3047 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3048                                VPSlotTracker &SlotTracker) const {
3049   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3050   IG->getInsertPos()->printAsOperand(O, false);
3051   O << ", ";
3052   getAddr()->printAsOperand(O, SlotTracker);
3053   VPValue *Mask = getMask();
3054   if (Mask) {
3055     O << ", ";
3056     Mask->printAsOperand(O, SlotTracker);
3057   }
3058 
3059   unsigned OpIdx = 0;
3060   for (unsigned i = 0; i < IG->getFactor(); ++i) {
3061     if (!IG->getMember(i))
3062       continue;
3063     if (getNumStoreOperands() > 0) {
3064       O << "\n" << Indent << "  store ";
3065       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3066       O << " to index " << i;
3067     } else {
3068       O << "\n" << Indent << "  ";
3069       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3070       O << " = load from index " << i;
3071     }
3072     ++OpIdx;
3073   }
3074 }
3075 #endif
3076 
3077 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3078                                                 VPCostContext &Ctx) const {
3079   Instruction *InsertPos = getInsertPos();
3080   // Find the VPValue index of the interleave group. We need to skip gaps.
3081   unsigned InsertPosIdx = 0;
3082   for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3083     if (auto *Member = IG->getMember(Idx)) {
3084       if (Member == InsertPos)
3085         break;
3086       InsertPosIdx++;
3087     }
3088   Type *ValTy = Ctx.Types.inferScalarType(
3089       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
3090                                 : getStoredValues()[InsertPosIdx]);
3091   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
3092   unsigned AS = getLoadStoreAddressSpace(InsertPos);
3093   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3094 
3095   unsigned InterleaveFactor = IG->getFactor();
3096   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
3097 
3098   // Holds the indices of existing members in the interleaved group.
3099   SmallVector<unsigned, 4> Indices;
3100   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3101     if (IG->getMember(IF))
3102       Indices.push_back(IF);
3103 
3104   // Calculate the cost of the whole interleaved group.
3105   InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
3106       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
3107       IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps);
3108 
3109   if (!IG->isReverse())
3110     return Cost;
3111 
3112   return Cost + IG->getNumMembers() *
3113                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
3114                                            VectorTy, std::nullopt, CostKind, 0);
3115 }
3116 
3117 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3118 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3119                                    VPSlotTracker &SlotTracker) const {
3120   O << Indent << "EMIT ";
3121   printAsOperand(O, SlotTracker);
3122   O << " = CANONICAL-INDUCTION ";
3123   printOperands(O, SlotTracker);
3124 }
3125 #endif
3126 
3127 bool VPCanonicalIVPHIRecipe::isCanonical(
3128     InductionDescriptor::InductionKind Kind, VPValue *Start,
3129     VPValue *Step) const {
3130   // Must be an integer induction.
3131   if (Kind != InductionDescriptor::IK_IntInduction)
3132     return false;
3133   // Start must match the start value of this canonical induction.
3134   if (Start != getStartValue())
3135     return false;
3136 
3137   // If the step is defined by a recipe, it is not a ConstantInt.
3138   if (Step->getDefiningRecipe())
3139     return false;
3140 
3141   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
3142   return StepC && StepC->isOne();
3143 }
3144 
3145 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
3146   return IsScalarAfterVectorization &&
3147          (!IsScalable || vputils::onlyFirstLaneUsed(this));
3148 }
3149 
3150 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
3151   assert(getInductionDescriptor().getKind() ==
3152              InductionDescriptor::IK_PtrInduction &&
3153          "Not a pointer induction according to InductionDescriptor!");
3154   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
3155          "Unexpected type.");
3156   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
3157          "Recipe should have been replaced");
3158 
3159   unsigned CurrentPart = getUnrollPart(*this);
3160 
3161   // Build a pointer phi
3162   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
3163   Type *ScStValueType = ScalarStartValue->getType();
3164 
3165   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3166   PHINode *NewPointerPhi = nullptr;
3167   if (CurrentPart == 0) {
3168     auto *IVR = cast<VPHeaderPHIRecipe>(&getParent()
3169                                              ->getPlan()
3170                                              ->getVectorLoopRegion()
3171                                              ->getEntryBasicBlock()
3172                                              ->front());
3173     PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
3174     NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
3175                                     CanonicalIV->getIterator());
3176     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
3177     NewPointerPhi->setDebugLoc(getDebugLoc());
3178   } else {
3179     // The recipe has been unrolled. In that case, fetch the single pointer phi
3180     // shared among all unrolled parts of the recipe.
3181     auto *GEP =
3182         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
3183     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
3184   }
3185 
3186   // A pointer induction is performed using a GEP.
3187   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
3188   Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
3189   Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
3190   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
3191   // Add induction update using an incorrect block temporarily. The phi node
3192   // will be fixed after VPlan execution. Note that at this point the latch
3193   // block cannot be used, as it does not exist yet.
3194   // TODO: Model increment value in VPlan, by turning the recipe into a
3195   // multi-def and a subclass of VPHeaderPHIRecipe.
3196   if (CurrentPart == 0) {
3197     // The recipe represents the first part of the pointer induction. Create the
3198     // GEP to increment the phi across all unrolled parts.
3199     unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1;
3200     Value *NumUnrolledElems =
3201         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
3202 
3203     Value *InductionGEP = GetElementPtrInst::Create(
3204         State.Builder.getInt8Ty(), NewPointerPhi,
3205         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
3206         InductionLoc);
3207 
3208     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
3209   }
3210 
3211   // Create actual address geps that use the pointer phi as base and a
3212   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
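  // E.g. (illustrative) for VF = 4 and a scalar step of 8, part 0 uses byte
  // offsets <0, 8, 16, 24> from the pointer phi and part 1 uses
  // <32, 40, 48, 56>.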
3213   Type *VecPhiType = VectorType::get(PhiType, State.VF);
3214   Value *StartOffsetScalar = State.Builder.CreateMul(
3215       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
3216   Value *StartOffset =
3217       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
3218   // Create a vector of consecutive numbers from zero to VF.
3219   StartOffset = State.Builder.CreateAdd(
3220       StartOffset, State.Builder.CreateStepVector(VecPhiType));
3221 
3222   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
3223          "scalar step must be the same across all parts");
3224   Value *GEP = State.Builder.CreateGEP(
3225       State.Builder.getInt8Ty(), NewPointerPhi,
3226       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
3227                                                State.VF, ScalarStepValue)),
3228       "vector.gep");
3229   State.set(this, GEP);
3230 }
3231 
3232 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3233 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
3234                                           VPSlotTracker &SlotTracker) const {
3235   assert((getNumOperands() == 2 || getNumOperands() == 4) &&
3236          "unexpected number of operands");
3237   O << Indent << "EMIT ";
3238   printAsOperand(O, SlotTracker);
3239   O << " = WIDEN-POINTER-INDUCTION ";
3240   getStartValue()->printAsOperand(O, SlotTracker);
3241   O << ", ";
3242   getStepValue()->printAsOperand(O, SlotTracker);
3243   if (getNumOperands() == 4) {
3244     O << ", ";
3245     getOperand(2)->printAsOperand(O, SlotTracker);
3246     O << ", ";
3247     getOperand(3)->printAsOperand(O, SlotTracker);
3248   }
3249 }
3250 #endif
3251 
3252 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3253   assert(!State.Lane && "cannot be used in per-lane");
3254   if (State.ExpandedSCEVs.contains(Expr)) {
3255     // SCEV Expr has already been expanded, result must already be set. At the
3256     // moment we have to execute the entry block twice (once before skeleton
3257     // creation to get expanded SCEVs used by the skeleton and once during
3258     // regular VPlan execution).
3259     State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
3260     assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
3261            "Results must match");
3262     return;
3263   }
3264 
3265   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3266   SCEVExpander Exp(SE, DL, "induction");
3267 
3268   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
3269                                  &*State.Builder.GetInsertPoint());
3270   State.ExpandedSCEVs[Expr] = Res;
3271   State.set(this, Res, VPLane(0));
3272 }
3273 
3274 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3275 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3276                                VPSlotTracker &SlotTracker) const {
3277   O << Indent << "EMIT ";
3278   printAsOperand(O, SlotTracker);
3279   O << " = EXPAND SCEV " << *Expr;
3280 }
3281 #endif
3282 
3283 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3284   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3285   Type *STy = CanonicalIV->getType();
3286   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3287   ElementCount VF = State.VF;
3288   Value *VStart = VF.isScalar()
3289                       ? CanonicalIV
3290                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3291   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3292   if (VF.isVector()) {
3293     VStep = Builder.CreateVectorSplat(VF, VStep);
3294     VStep =
3295         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3296   }
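  // E.g. for VF = 4 and unroll part 0, the widened IV below is
  // <iv, iv + 1, iv + 2, iv + 3>.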
3297   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3298   State.set(this, CanonicalVectorIV);
3299 }
3300 
3301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3302 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3303                                      VPSlotTracker &SlotTracker) const {
3304   O << Indent << "EMIT ";
3305   printAsOperand(O, SlotTracker);
3306   O << " = WIDEN-CANONICAL-INDUCTION ";
3307   printOperands(O, SlotTracker);
3308 }
3309 #endif
3310 
3311 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3312   auto &Builder = State.Builder;
3313   // Create a vector from the initial value.
3314   auto *VectorInit = getStartValue()->getLiveInIRValue();
3315 
3316   Type *VecTy = State.VF.isScalar()
3317                     ? VectorInit->getType()
3318                     : VectorType::get(VectorInit->getType(), State.VF);
3319 
3320   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3321   if (State.VF.isVector()) {
3322     auto *IdxTy = Builder.getInt32Ty();
3323     auto *One = ConstantInt::get(IdxTy, 1);
3324     IRBuilder<>::InsertPointGuard Guard(Builder);
3325     Builder.SetInsertPoint(VectorPH->getTerminator());
3326     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3327     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
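    // E.g. for VF = 4 this builds <poison, poison, poison, %init>, placing the
    // initial recurrence value in the last lane.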
3328     VectorInit = Builder.CreateInsertElement(
3329         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3330   }
3331 
3332   // Create a phi node for the new recurrence.
3333   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3334   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3335   Phi->addIncoming(VectorInit, VectorPH);
3336   State.set(this, Phi);
3337 }
3338 
3339 InstructionCost
3340 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
3341                                              VPCostContext &Ctx) const {
3342   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3343   if (VF.isScalar())
3344     return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
3345 
3346   if (VF.isScalable() && VF.getKnownMinValue() == 1)
3347     return InstructionCost::getInvalid();
3348 
3349   SmallVector<int> Mask(VF.getKnownMinValue());
3350   std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
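  // E.g. for VF = 4 the splice mask is <3, 4, 5, 6>: the last lane of the
  // previous vector followed by the first three lanes of the current one.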
3351   Type *VectorTy =
3352       ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
3353 
3354   return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
3355                                 cast<VectorType>(VectorTy), Mask, CostKind,
3356                                 VF.getKnownMinValue() - 1);
3357 }
3358 
3359 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3360 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3361                                             VPSlotTracker &SlotTracker) const {
3362   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3363   printAsOperand(O, SlotTracker);
3364   O << " = phi ";
3365   printOperands(O, SlotTracker);
3366 }
3367 #endif
3368 
3369 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3370   auto &Builder = State.Builder;
3371 
3372   // Reductions do not have to start at zero. They can start with
3373   // any loop invariant values.
3374   VPValue *StartVPV = getStartValue();
3375   Value *StartV = StartVPV->getLiveInIRValue();
3376 
3377   // In order to support recurrences we need to be able to vectorize Phi nodes.
3378   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3379   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3380   // this value when we vectorize all of the instructions that use the PHI.
3381   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3382   Type *VecTy = ScalarPHI ? StartV->getType()
3383                           : VectorType::get(StartV->getType(), State.VF);
3384 
3385   BasicBlock *HeaderBB = State.CFG.PrevBB;
3386   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
3387          "recipe must be in the vector loop header");
3388   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3389   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3390   State.set(this, Phi, IsInLoop);
3391 
3392   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3393 
3394   Value *Iden = nullptr;
3395   RecurKind RK = RdxDesc.getRecurrenceKind();
3396   unsigned CurrentPart = getUnrollPart(*this);
3397 
3398   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3399       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3400     // MinMax and AnyOf reductions have the start value as their identity.
3401     if (ScalarPHI) {
3402       Iden = StartV;
3403     } else {
3404       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3405       Builder.SetInsertPoint(VectorPH->getTerminator());
3406       StartV = Iden = State.get(StartVPV);
3407     }
3408   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
3409     // [I|F]FindLastIV will use a sentinel value to initialize the reduction
3410     // phi or the resume value from the main vector loop when vectorizing the
3411     // epilogue loop. In the exit block, ComputeReductionResult will generate
3412     // checks to verify if the reduction result is the sentinel value. If the
3413     // result is the sentinel value, it will be corrected back to the start
3414     // value.
3415     // TODO: The sentinel value is not always necessary. When the start value is
3416     // a constant smaller than the start value of the induction variable, the
3417     // start value can be used directly to initialize the reduction phi.
3418     Iden = StartV;
3419     if (!ScalarPHI) {
3420       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3421       Builder.SetInsertPoint(VectorPH->getTerminator());
3422       StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
3423     }
3424   } else {
3425     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3426                                        RdxDesc.getFastMathFlags());
3427 
3428     if (!ScalarPHI) {
3429       if (CurrentPart == 0) {
3430         // Create start and identity vector values for the reduction in the
3431         // preheader.
3432         // TODO: Introduce recipes in VPlan preheader to create initial values.
3433         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3434         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3435         Builder.SetInsertPoint(VectorPH->getTerminator());
3436         Constant *Zero = Builder.getInt32(0);
3437         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
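        // E.g. for an integer add reduction with VF = 4, part 0 starts at
        // <StartV, 0, 0, 0> while the other parts start at the identity splat
        // <0, 0, 0, 0>.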
3438       } else {
3439         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3440       }
3441     }
3442   }
3443 
3444   Phi = cast<PHINode>(State.get(this, IsInLoop));
3445   Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3446   Phi->addIncoming(StartVal, VectorPH);
3447 }
3448 
3449 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3450 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3451                                  VPSlotTracker &SlotTracker) const {
3452   O << Indent << "WIDEN-REDUCTION-PHI ";
3453 
3454   printAsOperand(O, SlotTracker);
3455   O << " = phi ";
3456   printOperands(O, SlotTracker);
3457 }
3458 #endif
3459 
3460 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3461   assert(EnableVPlanNativePath &&
3462          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3463 
3464   Value *Op0 = State.get(getOperand(0));
3465   Type *VecTy = Op0->getType();
3466   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3467   State.set(this, VecPhi);
3468 }
3469 
3470 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3471 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3472                              VPSlotTracker &SlotTracker) const {
3473   O << Indent << "WIDEN-PHI ";
3474 
3475   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3476   // Unless all incoming values are modeled in VPlan, print the original PHI
3477   // directly.
3478   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3479   // values as VPValues.
3480   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3481     O << VPlanIngredient(OriginalPhi);
3482     return;
3483   }
3484 
3485   printAsOperand(O, SlotTracker);
3486   O << " = phi ";
3487   printOperands(O, SlotTracker);
3488 }
3489 #endif
3490 
3491 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3492 // remove VPActiveLaneMaskPHIRecipe.
3493 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3494   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3495   Value *StartMask = State.get(getOperand(0));
3496   PHINode *Phi =
3497       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3498   Phi->addIncoming(StartMask, VectorPH);
3499   Phi->setDebugLoc(getDebugLoc());
3500   State.set(this, Phi);
3501 }
3502 
3503 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3504 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3505                                       VPSlotTracker &SlotTracker) const {
3506   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3507 
3508   printAsOperand(O, SlotTracker);
3509   O << " = phi ";
3510   printOperands(O, SlotTracker);
3511 }
3512 #endif
3513 
3514 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3515 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3516                                   VPSlotTracker &SlotTracker) const {
3517   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3518 
3519   printAsOperand(O, SlotTracker);
3520   O << " = phi ";
3521   printOperands(O, SlotTracker);
3522 }
3523 #endif
3524 
3525 void VPScalarPHIRecipe::execute(VPTransformState &State) {
3526   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3527   Value *Start = State.get(getStartValue(), VPLane(0));
3528   PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
3529   Phi->addIncoming(Start, VectorPH);
3530   Phi->setDebugLoc(getDebugLoc());
3531   State.set(this, Phi, /*IsScalar=*/true);
3532 }
3533 
3534 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3535 void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3536                               VPSlotTracker &SlotTracker) const {
3537   O << Indent << "SCALAR-PHI ";
3538   printAsOperand(O, SlotTracker);
3539   O << " = phi ";
3540   printOperands(O, SlotTracker);
3541 }
3542 #endif
3543