1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "VPlanPatternMatch.h"
17 #include "VPlanUtils.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/IVDescriptors.h"
22 #include "llvm/IR/BasicBlock.h"
23 #include "llvm/IR/IRBuilder.h"
24 #include "llvm/IR/Instruction.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/IR/Value.h"
29 #include "llvm/IR/VectorBuilder.h"
30 #include "llvm/Support/Casting.h"
31 #include "llvm/Support/CommandLine.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
37 #include <cassert>
38 
39 using namespace llvm;
40 
41 using VectorParts = SmallVector<Value *, 2>;
42 
43 namespace llvm {
44 extern cl::opt<bool> EnableVPlanNativePath;
45 }
46 extern cl::opt<unsigned> ForceTargetInstructionCost;
47 
48 #define LV_NAME "loop-vectorize"
49 #define DEBUG_TYPE LV_NAME
50 
51 bool VPRecipeBase::mayWriteToMemory() const {
52   switch (getVPDefID()) {
53   case VPInstructionSC:
54     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
55   case VPInterleaveSC:
56     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
57   case VPWidenStoreEVLSC:
58   case VPWidenStoreSC:
59     return true;
60   case VPReplicateSC:
61     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
62         ->mayWriteToMemory();
63   case VPWidenCallSC:
64     return !cast<VPWidenCallRecipe>(this)
65                 ->getCalledScalarFunction()
66                 ->onlyReadsMemory();
67   case VPWidenIntrinsicSC:
68     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
69   case VPBranchOnMaskSC:
70   case VPScalarIVStepsSC:
71   case VPPredInstPHISC:
72     return false;
73   case VPBlendSC:
74   case VPReductionEVLSC:
75   case VPReductionSC:
76   case VPVectorPointerSC:
77   case VPWidenCanonicalIVSC:
78   case VPWidenCastSC:
79   case VPWidenGEPSC:
80   case VPWidenIntOrFpInductionSC:
81   case VPWidenLoadEVLSC:
82   case VPWidenLoadSC:
83   case VPWidenPHISC:
84   case VPWidenSC:
85   case VPWidenEVLSC:
86   case VPWidenSelectSC: {
87     const Instruction *I =
88         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
89     (void)I;
90     assert((!I || !I->mayWriteToMemory()) &&
91            "underlying instruction may write to memory");
92     return false;
93   }
94   default:
95     return true;
96   }
97 }
98 
99 bool VPRecipeBase::mayReadFromMemory() const {
100   switch (getVPDefID()) {
101   case VPInstructionSC:
102     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
103   case VPWidenLoadEVLSC:
104   case VPWidenLoadSC:
105     return true;
106   case VPReplicateSC:
107     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
108         ->mayReadFromMemory();
109   case VPWidenCallSC:
110     return !cast<VPWidenCallRecipe>(this)
111                 ->getCalledScalarFunction()
112                 ->onlyWritesMemory();
113   case VPWidenIntrinsicSC:
114     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
115   case VPBranchOnMaskSC:
116   case VPPredInstPHISC:
117   case VPScalarIVStepsSC:
118   case VPWidenStoreEVLSC:
119   case VPWidenStoreSC:
120     return false;
121   case VPBlendSC:
122   case VPReductionEVLSC:
123   case VPReductionSC:
124   case VPVectorPointerSC:
125   case VPWidenCanonicalIVSC:
126   case VPWidenCastSC:
127   case VPWidenGEPSC:
128   case VPWidenIntOrFpInductionSC:
129   case VPWidenPHISC:
130   case VPWidenSC:
131   case VPWidenEVLSC:
132   case VPWidenSelectSC: {
133     const Instruction *I =
134         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
135     (void)I;
136     assert((!I || !I->mayReadFromMemory()) &&
137            "underlying instruction may read from memory");
138     return false;
139   }
140   default:
141     return true;
142   }
143 }
144 
145 bool VPRecipeBase::mayHaveSideEffects() const {
146   switch (getVPDefID()) {
147   case VPDerivedIVSC:
148   case VPPredInstPHISC:
149   case VPScalarCastSC:
150   case VPReverseVectorPointerSC:
151     return false;
152   case VPInstructionSC:
153     return mayWriteToMemory();
154   case VPWidenCallSC: {
155     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
156     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
157   }
158   case VPWidenIntrinsicSC:
159     return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
160   case VPBlendSC:
161   case VPReductionEVLSC:
162   case VPReductionSC:
163   case VPScalarIVStepsSC:
164   case VPVectorPointerSC:
165   case VPWidenCanonicalIVSC:
166   case VPWidenCastSC:
167   case VPWidenGEPSC:
168   case VPWidenIntOrFpInductionSC:
169   case VPWidenPHISC:
170   case VPWidenPointerInductionSC:
171   case VPWidenSC:
172   case VPWidenEVLSC:
173   case VPWidenSelectSC: {
174     const Instruction *I =
175         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
176     (void)I;
177     assert((!I || !I->mayHaveSideEffects()) &&
178            "underlying instruction has side-effects");
179     return false;
180   }
181   case VPInterleaveSC:
182     return mayWriteToMemory();
183   case VPWidenLoadEVLSC:
184   case VPWidenLoadSC:
185   case VPWidenStoreEVLSC:
186   case VPWidenStoreSC:
187     assert(
188         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
189             mayWriteToMemory() &&
190         "mayHaveSideffects result for ingredient differs from this "
191         "implementation");
192     return mayWriteToMemory();
193   case VPReplicateSC: {
194     auto *R = cast<VPReplicateRecipe>(this);
195     return R->getUnderlyingInstr()->mayHaveSideEffects();
196   }
197   default:
198     return true;
199   }
200 }
201 
202 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
203   assert(!Parent && "Recipe already in some VPBasicBlock");
204   assert(InsertPos->getParent() &&
205          "Insertion position not in any VPBasicBlock");
206   InsertPos->getParent()->insert(this, InsertPos->getIterator());
207 }
208 
209 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
210                                 iplist<VPRecipeBase>::iterator I) {
211   assert(!Parent && "Recipe already in some VPBasicBlock");
212   assert(I == BB.end() || I->getParent() == &BB);
213   BB.insert(this, I);
214 }
215 
216 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
217   assert(!Parent && "Recipe already in some VPBasicBlock");
218   assert(InsertPos->getParent() &&
219          "Insertion position not in any VPBasicBlock");
220   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
221 }
222 
223 void VPRecipeBase::removeFromParent() {
224   assert(getParent() && "Recipe not in any VPBasicBlock");
225   getParent()->getRecipeList().remove(getIterator());
226   Parent = nullptr;
227 }
228 
229 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
230   assert(getParent() && "Recipe not in any VPBasicBlock");
231   return getParent()->getRecipeList().erase(getIterator());
232 }
233 
234 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
235   removeFromParent();
236   insertAfter(InsertPos);
237 }
238 
239 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
240                               iplist<VPRecipeBase>::iterator I) {
241   removeFromParent();
242   insertBefore(BB, I);
243 }
244 
245 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
246   // Get the underlying instruction for the recipe, if there is one. It is used
247   // to
248   //   * decide if cost computation should be skipped for this recipe,
249   //   * apply forced target instruction cost.
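  // For example, invoking the loop vectorizer with
  // -force-target-instruction-cost=1 (the flag behind
  // ForceTargetInstructionCost) overrides RecipeCost below for any recipe
  // with an underlying instruction; this is commonly used in lit tests to
  // isolate cost-model behavior.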
250   Instruction *UI = nullptr;
251   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
252     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
253   else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
254     UI = IG->getInsertPos();
255   else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
256     UI = &WidenMem->getIngredient();
257 
258   InstructionCost RecipeCost;
259   if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
260     RecipeCost = 0;
261   } else {
262     RecipeCost = computeCost(VF, Ctx);
263     if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
264         RecipeCost.isValid())
265       RecipeCost = InstructionCost(ForceTargetInstructionCost);
266   }
267 
268   LLVM_DEBUG({
269     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
270     dump();
271   });
272   return RecipeCost;
273 }
274 
275 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
276                                           VPCostContext &Ctx) const {
277   llvm_unreachable("subclasses should implement computeCost");
278 }
279 
280 InstructionCost
281 VPPartialReductionRecipe::computeCost(ElementCount VF,
282                                       VPCostContext &Ctx) const {
283   std::optional<unsigned> Opcode = std::nullopt;
284   VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe();
285   if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR))
286     Opcode = std::make_optional(WidenR->getOpcode());
287 
288   VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe();
289   VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe();
290 
291   auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
292   auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
293                                                      : BinOpR->getOperand(0));
294   auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
295                                                      : BinOpR->getOperand(1));
296 
297   auto GetExtendKind = [](VPRecipeBase *R) {
298     // The extend could come from outside the plan.
299     if (!R)
300       return TargetTransformInfo::PR_None;
301     auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
302     if (!WidenCastR)
303       return TargetTransformInfo::PR_None;
304     if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
305       return TargetTransformInfo::PR_ZeroExtend;
306     if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
307       return TargetTransformInfo::PR_SignExtend;
308     return TargetTransformInfo::PR_None;
309   };
310 
311   return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
312                                          PhiType, VF, GetExtendKind(ExtAR),
313                                          GetExtendKind(ExtBR), Opcode);
314 }
315 
316 void VPPartialReductionRecipe::execute(VPTransformState &State) {
317   State.setDebugLocFrom(getDebugLoc());
318   auto &Builder = State.Builder;
319 
320   assert(getOpcode() == Instruction::Add &&
321          "Unhandled partial reduction opcode");
322 
323   Value *BinOpVal = State.get(getOperand(0));
324   Value *PhiVal = State.get(getOperand(1));
325   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
326 
327   Type *RetTy = PhiVal->getType();
328 
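  // The call below takes the narrower accumulator vector and the wider input
  // vector. As an illustrative sketch (value names and types depend on the
  // actual operands), a 16-to-4 element i32 partial reduction would be:
  //   %partial.reduce = call <4 x i32>
  //       @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(
  //           <4 x i32> %phi, <16 x i32> %bin.op)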
329   CallInst *V = Builder.CreateIntrinsic(
330       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
331       {PhiVal, BinOpVal}, nullptr, "partial.reduce");
332 
333   State.set(this, V);
334 }
335 
336 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
337 void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
338                                      VPSlotTracker &SlotTracker) const {
339   O << Indent << "PARTIAL-REDUCE ";
340   printAsOperand(O, SlotTracker);
341   O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
342   printOperands(O, SlotTracker);
343 }
344 #endif
345 
346 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
347   assert(OpType == OperationType::FPMathOp &&
348          "recipe doesn't have fast math flags");
349   FastMathFlags Res;
350   Res.setAllowReassoc(FMFs.AllowReassoc);
351   Res.setNoNaNs(FMFs.NoNaNs);
352   Res.setNoInfs(FMFs.NoInfs);
353   Res.setNoSignedZeros(FMFs.NoSignedZeros);
354   Res.setAllowReciprocal(FMFs.AllowReciprocal);
355   Res.setAllowContract(FMFs.AllowContract);
356   Res.setApproxFunc(FMFs.ApproxFunc);
357   return Res;
358 }
359 
360 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
361 void VPSingleDefRecipe::dump() const { VPDef::dump(); }
362 #endif
363 
364 template <unsigned PartOpIdx>
365 VPValue *
366 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
367   if (U.getNumOperands() == PartOpIdx + 1)
368     return U.getOperand(PartOpIdx);
369   return nullptr;
370 }
371 
372 template <unsigned PartOpIdx>
373 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
374   if (auto *UnrollPartOp = getUnrollPartOperand(U))
375     return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
376   return 0;
377 }
378 
379 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
380                              VPValue *A, VPValue *B, DebugLoc DL,
381                              const Twine &Name)
382     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
383                           Pred, DL),
384       Opcode(Opcode), Name(Name.str()) {
385   assert(Opcode == Instruction::ICmp &&
386          "only ICmp predicates supported at the moment");
387 }
388 
389 VPInstruction::VPInstruction(unsigned Opcode,
390                              std::initializer_list<VPValue *> Operands,
391                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
392     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
393       Opcode(Opcode), Name(Name.str()) {
394   // Make sure the VPInstruction is a floating-point operation.
395   assert(isFPMathOp() && "this op can't take fast-math flags");
396 }
397 
398 bool VPInstruction::doesGeneratePerAllLanes() const {
399   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
400 }
401 
402 bool VPInstruction::canGenerateScalarForFirstLane() const {
403   if (Instruction::isBinaryOp(getOpcode()))
404     return true;
405   if (isSingleScalar() || isVectorToScalar())
406     return true;
407   switch (Opcode) {
408   case Instruction::ICmp:
409   case Instruction::Select:
410   case VPInstruction::BranchOnCond:
411   case VPInstruction::BranchOnCount:
412   case VPInstruction::CalculateTripCountMinusVF:
413   case VPInstruction::CanonicalIVIncrementForPart:
414   case VPInstruction::PtrAdd:
415   case VPInstruction::ExplicitVectorLength:
416   case VPInstruction::AnyOf:
417     return true;
418   default:
419     return false;
420   }
421 }
422 
423 Value *VPInstruction::generatePerLane(VPTransformState &State,
424                                       const VPLane &Lane) {
425   IRBuilderBase &Builder = State.Builder;
426 
427   assert(getOpcode() == VPInstruction::PtrAdd &&
428          "only PtrAdd opcodes are supported for now");
429   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
430                               State.get(getOperand(1), Lane), Name);
431 }
432 
433 Value *VPInstruction::generate(VPTransformState &State) {
434   IRBuilderBase &Builder = State.Builder;
435 
436   if (Instruction::isBinaryOp(getOpcode())) {
437     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
438     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
439     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
440     auto *Res =
441         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
442     if (auto *I = dyn_cast<Instruction>(Res))
443       setFlags(I);
444     return Res;
445   }
446 
447   switch (getOpcode()) {
448   case VPInstruction::Not: {
449     Value *A = State.get(getOperand(0));
450     return Builder.CreateNot(A, Name);
451   }
452   case Instruction::ICmp: {
453     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
454     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
455     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
456     return Builder.CreateCmp(getPredicate(), A, B, Name);
457   }
458   case Instruction::Select: {
459     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
460     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
461     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
462     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
463     return Builder.CreateSelect(Cond, Op1, Op2, Name);
464   }
465   case VPInstruction::ActiveLaneMask: {
466     // Get first lane of vector induction variable.
467     Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
468     // Get the original loop tripcount.
469     Value *ScalarTC = State.get(getOperand(1), VPLane(0));
470 
471     // If this part of the active lane mask is scalar, generate the CMP directly
472     // to avoid unnecessary extracts.
473     if (State.VF.isScalar())
474       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
475                                Name);
476 
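    // For the vector case, the call below produces a mask along the lines of
    // the following sketch (VF=4 and an i64 trip count assumed; value names
    // are placeholders):
    //   %active.lane.mask = call <4 x i1>
    //       @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %tc)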
477     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
478     auto *PredTy = VectorType::get(Int1Ty, State.VF);
479     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
480                                    {PredTy, ScalarTC->getType()},
481                                    {VIVElem0, ScalarTC}, nullptr, Name);
482   }
483   case VPInstruction::FirstOrderRecurrenceSplice: {
484     // Generate code to combine the previous and current values in vector v3.
485     //
486     //   vector.ph:
487     //     v_init = vector(..., ..., ..., a[-1])
488     //     br vector.body
489     //
490     //   vector.body
491     //     i = phi [0, vector.ph], [i+4, vector.body]
492     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
493     //     v2 = a[i, i+1, i+2, i+3];
494     //     v3 = vector(v1(3), v2(0, 1, 2))
495 
496     auto *V1 = State.get(getOperand(0));
497     if (!V1->getType()->isVectorTy())
498       return V1;
499     Value *V2 = State.get(getOperand(1));
500     return Builder.CreateVectorSplice(V1, V2, -1, Name);
501   }
502   case VPInstruction::CalculateTripCountMinusVF: {
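    // This emits the equivalent of TC > VF * UF ? TC - VF * UF : 0. As an
    // illustrative sketch for a fixed VF=4 and UF=2 (names are placeholders):
    //   %sub = sub i64 %tc, 8
    //   %cmp = icmp ugt i64 %tc, 8
    //   %res = select i1 %cmp, i64 %sub, i64 0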
503     unsigned UF = getParent()->getPlan()->getUF();
504     Value *ScalarTC = State.get(getOperand(0), VPLane(0));
505     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
506     Value *Sub = Builder.CreateSub(ScalarTC, Step);
507     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
508     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
509     return Builder.CreateSelect(Cmp, Sub, Zero);
510   }
511   case VPInstruction::ExplicitVectorLength: {
512     // TODO: Restructure this code with an explicit remainder loop so that
513     // vsetvli can be outside of the main loop.
514     Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
515     // Compute EVL
516     assert(AVL->getType()->isIntegerTy() &&
517            "Requested vector length should be an integer.");
518 
519     assert(State.VF.isScalable() && "Expected scalable vector factor.");
520     Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
521 
522     Value *EVL = State.Builder.CreateIntrinsic(
523         State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
524         {AVL, VFArg, State.Builder.getTrue()});
525     return EVL;
526   }
527   case VPInstruction::CanonicalIVIncrementForPart: {
528     unsigned Part = getUnrollPart(*this);
529     auto *IV = State.get(getOperand(0), VPLane(0));
530     assert(Part != 0 && "Must have a positive part");
531     // The canonical IV is incremented by the vectorization factor (num of
532     // SIMD elements) times the unroll part.
533     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
534     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
535                              hasNoSignedWrap());
536   }
537   case VPInstruction::BranchOnCond: {
538     Value *Cond = State.get(getOperand(0), VPLane(0));
539     // Replace the temporary unreachable terminator with a new conditional
540     // branch, hooking it up to backward destination for exiting blocks now and
541     // to forward destination(s) later when they are created.
542     BranchInst *CondBr =
543         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
544     CondBr->setSuccessor(0, nullptr);
545     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
546 
547     if (!getParent()->isExiting())
548       return CondBr;
549 
550     VPRegionBlock *ParentRegion = getParent()->getParent();
551     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
552     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
553     return CondBr;
554   }
555   case VPInstruction::BranchOnCount: {
556     // First create the compare.
557     Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
558     Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
559     Value *Cond = Builder.CreateICmpEQ(IV, TC);
560 
561     // Now create the branch.
562     auto *Plan = getParent()->getPlan();
563     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
564     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
565 
566     // Replace the temporary unreachable terminator with a new conditional
567     // branch, hooking it up to backward destination (the header) now and to the
568     // forward destination (the exit/middle block) later when it is created.
569     // Note that CreateCondBr expects a valid BB as first argument, so we need
570     // to set it to nullptr later.
571     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
572                                               State.CFG.VPBB2IRBB[Header]);
573     CondBr->setSuccessor(0, nullptr);
574     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
575     return CondBr;
576   }
577   case VPInstruction::ComputeReductionResult: {
578     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
579     // and will be removed by breaking up the recipe further.
580     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
581     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
582     // Get its reduction variable descriptor.
583     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
584 
585     RecurKind RK = RdxDesc.getRecurrenceKind();
586 
587     Type *PhiTy = OrigPhi->getType();
588     // The recipe's operands are the reduction phi, followed by one operand for
589     // each part of the reduction.
590     unsigned UF = getNumOperands() - 1;
591     VectorParts RdxParts(UF);
592     for (unsigned Part = 0; Part < UF; ++Part)
593       RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
594 
595     // If the vector reduction can be performed in a smaller type, we truncate
596     // then extend the loop exit value to enable InstCombine to evaluate the
597     // entire expression in the smaller type.
598     // TODO: Handle this in truncateToMinBW.
599     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
600       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
601       for (unsigned Part = 0; Part < UF; ++Part)
602         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
603     }
604     // Reduce all of the unrolled parts into a single vector.
605     Value *ReducedPartRdx = RdxParts[0];
606     unsigned Op = RdxDesc.getOpcode();
607     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
608       Op = Instruction::Or;
609 
610     if (PhiR->isOrdered()) {
611       ReducedPartRdx = RdxParts[UF - 1];
612     } else {
613       // Floating-point operations should have some FMF to enable the reduction.
614       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
615       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
616       for (unsigned Part = 1; Part < UF; ++Part) {
617         Value *RdxPart = RdxParts[Part];
618         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
619           ReducedPartRdx = Builder.CreateBinOp(
620               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
621         else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
622           ReducedPartRdx =
623               createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
624         else
625           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
626       }
627     }
628 
629     // Create the reduction after the loop. Note that inloop reductions create
630     // the target reduction in the loop using a Reduction recipe.
631     if ((State.VF.isVector() ||
632          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
633          RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
634         !PhiR->isInLoop()) {
635       ReducedPartRdx =
636           createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
637       // If the reduction can be performed in a smaller type, we need to extend
638       // the reduction to the wider type before we branch to the original loop.
639       if (PhiTy != RdxDesc.getRecurrenceType())
640         ReducedPartRdx = RdxDesc.isSigned()
641                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
642                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
643     }
644 
645     return ReducedPartRdx;
646   }
647   case VPInstruction::ExtractFromEnd: {
648     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
649     unsigned Offset = CI->getZExtValue();
650     assert(Offset > 0 && "Offset from end must be positive");
651     Value *Res;
652     if (State.VF.isVector()) {
653       assert(Offset <= State.VF.getKnownMinValue() &&
654              "invalid offset to extract from");
655       // Extract lane VF - Offset from the operand.
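      // For example, with a fixed VF=4 and Offset=1 this reads lane 3, i.e.
      // the last element; with a scalable VF the lane index is computed at
      // runtime from vscale.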
656       Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
657     } else {
658       assert(Offset <= 1 && "invalid offset to extract from");
659       Res = State.get(getOperand(0));
660     }
661     if (isa<ExtractElementInst>(Res))
662       Res->setName(Name);
663     return Res;
664   }
665   case VPInstruction::LogicalAnd: {
666     Value *A = State.get(getOperand(0));
667     Value *B = State.get(getOperand(1));
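    // CreateLogicalAnd emits a select rather than a plain 'and' so that
    // poison in B does not leak through lanes where A is false, e.g.
    // (illustrative, VF=4):
    //   %logical.and = select <4 x i1> %a, <4 x i1> %b,
    //                         <4 x i1> zeroinitializer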
668     return Builder.CreateLogicalAnd(A, B, Name);
669   }
670   case VPInstruction::PtrAdd: {
671     assert(vputils::onlyFirstLaneUsed(this) &&
672            "can only generate first lane for PtrAdd");
673     Value *Ptr = State.get(getOperand(0), VPLane(0));
674     Value *Addend = State.get(getOperand(1), VPLane(0));
675     return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
676   }
677   case VPInstruction::ResumePhi: {
678     Value *IncomingFromVPlanPred =
679         State.get(getOperand(0), /* IsScalar */ true);
680     Value *IncomingFromOtherPreds =
681         State.get(getOperand(1), /* IsScalar */ true);
682     auto *NewPhi =
683         Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
684     BasicBlock *VPlanPred =
685         State.CFG
686             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
687     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
688     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
689       if (OtherPred == VPlanPred)
690         continue;
691       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
692     }
693     return NewPhi;
694   }
695   case VPInstruction::AnyOf: {
696     Value *A = State.get(getOperand(0));
697     return Builder.CreateOrReduce(A);
698   }
699 
700   default:
701     llvm_unreachable("Unsupported opcode for instruction");
702   }
703 }
704 
705 bool VPInstruction::isVectorToScalar() const {
706   return getOpcode() == VPInstruction::ExtractFromEnd ||
707          getOpcode() == VPInstruction::ComputeReductionResult ||
708          getOpcode() == VPInstruction::AnyOf;
709 }
710 
711 bool VPInstruction::isSingleScalar() const {
712   return getOpcode() == VPInstruction::ResumePhi;
713 }
714 
715 #if !defined(NDEBUG)
716 bool VPInstruction::isFPMathOp() const {
717   // Inspired by FPMathOperator::classof. Notable differences are that we don't
718   // support Call and PHI opcodes here yet.
719   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
720          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
721          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
722          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
723 }
724 #endif
725 
726 void VPInstruction::execute(VPTransformState &State) {
727   assert(!State.Lane && "VPInstruction executing a Lane");
728   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
729   assert((hasFastMathFlags() == isFPMathOp() ||
730           getOpcode() == Instruction::Select) &&
731          "Recipe not a FPMathOp but has fast-math flags?");
732   if (hasFastMathFlags())
733     State.Builder.setFastMathFlags(getFastMathFlags());
734   State.setDebugLocFrom(getDebugLoc());
735   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
736                                    (vputils::onlyFirstLaneUsed(this) ||
737                                     isVectorToScalar() || isSingleScalar());
738   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
739   if (GeneratesPerAllLanes) {
740     for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
741          Lane != NumLanes; ++Lane) {
742       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
743       assert(GeneratedValue && "generatePerLane must produce a value");
744       State.set(this, GeneratedValue, VPLane(Lane));
745     }
746     return;
747   }
748 
749   Value *GeneratedValue = generate(State);
750   if (!hasResult())
751     return;
752   assert(GeneratedValue && "generate must produce a value");
753   assert(
754       (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
755        State.VF.isScalar()) &&
756       "scalar value but not only first lane defined");
757   State.set(this, GeneratedValue,
758             /*IsScalar*/ GeneratesPerFirstLaneOnly);
759 }
760 
761 bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
762   if (Instruction::isBinaryOp(getOpcode()))
763     return false;
764   switch (getOpcode()) {
765   case Instruction::ICmp:
766   case Instruction::Select:
767   case VPInstruction::AnyOf:
768   case VPInstruction::CalculateTripCountMinusVF:
769   case VPInstruction::CanonicalIVIncrementForPart:
770   case VPInstruction::ExtractFromEnd:
771   case VPInstruction::FirstOrderRecurrenceSplice:
772   case VPInstruction::LogicalAnd:
773   case VPInstruction::Not:
774   case VPInstruction::PtrAdd:
775     return false;
776   default:
777     return true;
778   }
779 }
780 
781 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
782   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
783   if (Instruction::isBinaryOp(getOpcode()))
784     return vputils::onlyFirstLaneUsed(this);
785 
786   switch (getOpcode()) {
787   default:
788     return false;
789   case Instruction::ICmp:
790   case Instruction::Select:
791   case Instruction::Or:
792   case VPInstruction::PtrAdd:
793     // TODO: Cover additional opcodes.
794     return vputils::onlyFirstLaneUsed(this);
795   case VPInstruction::ActiveLaneMask:
796   case VPInstruction::ExplicitVectorLength:
797   case VPInstruction::CalculateTripCountMinusVF:
798   case VPInstruction::CanonicalIVIncrementForPart:
799   case VPInstruction::BranchOnCount:
800   case VPInstruction::BranchOnCond:
801   case VPInstruction::ResumePhi:
802     return true;
803   };
804   llvm_unreachable("switch should return");
805 }
806 
807 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
808   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
809   if (Instruction::isBinaryOp(getOpcode()))
810     return vputils::onlyFirstPartUsed(this);
811 
812   switch (getOpcode()) {
813   default:
814     return false;
815   case Instruction::ICmp:
816   case Instruction::Select:
817     return vputils::onlyFirstPartUsed(this);
818   case VPInstruction::BranchOnCount:
819   case VPInstruction::BranchOnCond:
820   case VPInstruction::CanonicalIVIncrementForPart:
821     return true;
822   };
823   llvm_unreachable("switch should return");
824 }
825 
826 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
827 void VPInstruction::dump() const {
828   VPSlotTracker SlotTracker(getParent()->getPlan());
829   print(dbgs(), "", SlotTracker);
830 }
831 
832 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
833                           VPSlotTracker &SlotTracker) const {
834   O << Indent << "EMIT ";
835 
836   if (hasResult()) {
837     printAsOperand(O, SlotTracker);
838     O << " = ";
839   }
840 
841   switch (getOpcode()) {
842   case VPInstruction::Not:
843     O << "not";
844     break;
845   case VPInstruction::SLPLoad:
846     O << "combined load";
847     break;
848   case VPInstruction::SLPStore:
849     O << "combined store";
850     break;
851   case VPInstruction::ActiveLaneMask:
852     O << "active lane mask";
853     break;
854   case VPInstruction::ResumePhi:
855     O << "resume-phi";
856     break;
857   case VPInstruction::ExplicitVectorLength:
858     O << "EXPLICIT-VECTOR-LENGTH";
859     break;
860   case VPInstruction::FirstOrderRecurrenceSplice:
861     O << "first-order splice";
862     break;
863   case VPInstruction::BranchOnCond:
864     O << "branch-on-cond";
865     break;
866   case VPInstruction::CalculateTripCountMinusVF:
867     O << "TC > VF ? TC - VF : 0";
868     break;
869   case VPInstruction::CanonicalIVIncrementForPart:
870     O << "VF * Part +";
871     break;
872   case VPInstruction::BranchOnCount:
873     O << "branch-on-count";
874     break;
875   case VPInstruction::ExtractFromEnd:
876     O << "extract-from-end";
877     break;
878   case VPInstruction::ComputeReductionResult:
879     O << "compute-reduction-result";
880     break;
881   case VPInstruction::LogicalAnd:
882     O << "logical-and";
883     break;
884   case VPInstruction::PtrAdd:
885     O << "ptradd";
886     break;
887   case VPInstruction::AnyOf:
888     O << "any-of";
889     break;
890   default:
891     O << Instruction::getOpcodeName(getOpcode());
892   }
893 
894   printFlags(O);
895   printOperands(O, SlotTracker);
896 
897   if (auto DL = getDebugLoc()) {
898     O << ", !dbg ";
899     DL.print(O);
900   }
901 }
902 #endif
903 
904 void VPIRInstruction::execute(VPTransformState &State) {
905   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
906          "Only PHINodes can have extra operands");
907   for (const auto &[Idx, Op] : enumerate(operands())) {
908     VPValue *ExitValue = Op;
909     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
910                     ? VPLane::getFirstLane()
911                     : VPLane::getLastLaneForVF(State.VF);
912     VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
913     auto *PredVPBB = Pred->getExitingBasicBlock();
914     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
915     // Set insertion point in PredBB in case an extract needs to be generated.
916     // TODO: Model extracts explicitly.
917     State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
918     Value *V = State.get(ExitValue, VPLane(Lane));
919     auto *Phi = cast<PHINode>(&I);
920     // If there is no existing block for PredBB in the phi, add a new incoming
921     // value. Otherwise update the existing incoming value for PredBB.
922     if (Phi->getBasicBlockIndex(PredBB) == -1)
923       Phi->addIncoming(V, PredBB);
924     else
925       Phi->setIncomingValueForBlock(PredBB, V);
926   }
927 
928   // Advance the insert point after the wrapped IR instruction. This allows
929   // interleaving VPIRInstructions and other recipes.
930   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
931 }
932 
933 InstructionCost VPIRInstruction::computeCost(ElementCount VF,
934                                              VPCostContext &Ctx) const {
935   // The recipe wraps an existing IR instruction on the border of VPlan's scope,
936   // hence it does not contribute to the cost-modeling for the VPlan.
937   return 0;
938 }
939 
940 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
941 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
942                             VPSlotTracker &SlotTracker) const {
943   O << Indent << "IR " << I;
944 
945   if (getNumOperands() != 0) {
946     O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
947     interleaveComma(
948         enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
949           Op.value()->printAsOperand(O, SlotTracker);
950           O << " from ";
951           getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
952         });
953     O << ")";
954   }
955 }
956 #endif
957 
958 void VPWidenCallRecipe::execute(VPTransformState &State) {
959   assert(State.VF.isVector() && "not widening");
960   State.setDebugLocFrom(getDebugLoc());
961 
962   FunctionType *VFTy = Variant->getFunctionType();
964   SmallVector<Value *, 4> Args;
965   for (const auto &I : enumerate(arg_operands())) {
966     Value *Arg;
967     // Some vectorized function variants may also take a scalar argument,
968     // e.g. linear parameters for pointers. This needs to be the scalar value
969     // from the start of the respective part when interleaving.
970     if (!VFTy->getParamType(I.index())->isVectorTy())
971       Arg = State.get(I.value(), VPLane(0));
972     else
973       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
974     Args.push_back(Arg);
975   }
976 
977   assert(Variant != nullptr && "Can't create vector function.");
978 
979   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
980   SmallVector<OperandBundleDef, 1> OpBundles;
981   if (CI)
982     CI->getOperandBundlesAsDefs(OpBundles);
983 
984   CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
985   setFlags(V);
986 
987   if (!V->getType()->isVoidTy())
988     State.set(this, V);
989   State.addMetadata(V, CI);
990 }
991 
992 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
993                                                VPCostContext &Ctx) const {
994   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
995                                   Variant->getFunctionType()->params(),
996                                   Ctx.CostKind);
997 }
998 
999 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1000 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
1001                               VPSlotTracker &SlotTracker) const {
1002   O << Indent << "WIDEN-CALL ";
1003 
1004   Function *CalledFn = getCalledScalarFunction();
1005   if (CalledFn->getReturnType()->isVoidTy())
1006     O << "void ";
1007   else {
1008     printAsOperand(O, SlotTracker);
1009     O << " = ";
1010   }
1011 
1012   O << "call";
1013   printFlags(O);
1014   O << " @" << CalledFn->getName() << "(";
1015   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
1016     Op->printAsOperand(O, SlotTracker);
1017   });
1018   O << ")";
1019 
1020   O << " (using library function";
1021   if (Variant->hasName())
1022     O << ": " << Variant->getName();
1023   O << ")";
1024 }
1025 #endif
1026 
1027 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1028   assert(State.VF.isVector() && "not widening");
1029   State.setDebugLocFrom(getDebugLoc());
1030 
1031   SmallVector<Type *, 2> TysForDecl;
1032   // Add return type if intrinsic is overloaded on it.
1033   if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1034     TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1035   SmallVector<Value *, 4> Args;
1036   for (const auto &I : enumerate(operands())) {
1037     // Some intrinsics have a scalar argument - don't replace it with a
1038     // vector.
1039     Value *Arg;
1040     if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1041                                            State.TTI))
1042       Arg = State.get(I.value(), VPLane(0));
1043     else
1044       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1045     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1046                                                State.TTI))
1047       TysForDecl.push_back(Arg->getType());
1048     Args.push_back(Arg);
1049   }
1050 
1051   // Use vector version of the intrinsic.
1052   Module *M = State.Builder.GetInsertBlock()->getModule();
1053   Function *VectorF =
1054       Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1055   assert(VectorF &&
1056          "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1057 
1058   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1059   SmallVector<OperandBundleDef, 1> OpBundles;
1060   if (CI)
1061     CI->getOperandBundlesAsDefs(OpBundles);
1062 
1063   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1064 
1065   setFlags(V);
1066 
1067   if (!V->getType()->isVoidTy())
1068     State.set(this, V);
1069   State.addMetadata(V, CI);
1070 }
1071 
1072 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1073                                                     VPCostContext &Ctx) const {
1074   // Some backends analyze intrinsic arguments to determine cost. Use the
1075   // underlying value for the operand if it has one. Otherwise try to use the
1076   // operand of the underlying call instruction, if there is one. Otherwise
1077   // clear Arguments.
1078   // TODO: Rework TTI interface to be independent of concrete IR values.
1079   SmallVector<const Value *> Arguments;
1080   for (const auto &[Idx, Op] : enumerate(operands())) {
1081     auto *V = Op->getUnderlyingValue();
1082     if (!V) {
1083       // Push all the VP intrinsic's operands into Arguments, even if an
1084       // operand is nullptr. Some VP intrinsics' cost computations assert on
1085       // the number of parameters; this mainly happens in two scenarios:
1086       // 1. The EVL operand is nullptr.
1087       // 2. An argument of the VP intrinsic is itself a VP intrinsic.
1088       if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
1089         Arguments.push_back(V);
1090         continue;
1091       }
1092       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
1093         Arguments.push_back(UI->getArgOperand(Idx));
1094         continue;
1095       }
1096       Arguments.clear();
1097       break;
1098     }
1099     Arguments.push_back(V);
1100   }
1101 
1102   Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1103   SmallVector<Type *> ParamTys;
1104   for (unsigned I = 0; I != getNumOperands(); ++I)
1105     ParamTys.push_back(
1106         toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
1107 
1108   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1109   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1110   IntrinsicCostAttributes CostAttrs(
1111       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1112       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
1113   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1114 }
1115 
1116 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1117   return Intrinsic::getBaseName(VectorIntrinsicID);
1118 }
1119 
1120 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1121   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1122   // Vector predication intrinsics only demand the first lane of the last
1123   // operand (the EVL operand).
1124   return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
1125          Op == getOperand(getNumOperands() - 1);
1126 }
1127 
1128 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1129 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
1130                                    VPSlotTracker &SlotTracker) const {
1131   O << Indent << "WIDEN-INTRINSIC ";
1132   if (ResultTy->isVoidTy()) {
1133     O << "void ";
1134   } else {
1135     printAsOperand(O, SlotTracker);
1136     O << " = ";
1137   }
1138 
1139   O << "call";
1140   printFlags(O);
1141   O << getIntrinsicName() << "(";
1142 
1143   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1144     Op->printAsOperand(O, SlotTracker);
1145   });
1146   O << ")";
1147 }
1148 #endif
1149 
1150 void VPHistogramRecipe::execute(VPTransformState &State) {
1151   State.setDebugLocFrom(getDebugLoc());
1152   IRBuilderBase &Builder = State.Builder;
1153 
1154   Value *Address = State.get(getOperand(0));
1155   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1156   VectorType *VTy = cast<VectorType>(Address->getType());
1157 
1158   // The histogram intrinsic requires a mask even if the recipe doesn't;
1159   // if the mask operand was omitted then all lanes should be executed and
1160   // we just need to synthesize an all-true mask.
1161   Value *Mask = nullptr;
1162   if (VPValue *VPMask = getMask())
1163     Mask = State.get(VPMask);
1164   else
1165     Mask =
1166         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1167 
1168   // If this is a subtract, we want to invert the increment amount. We may
1169   // add a separate intrinsic in future, but for now we'll try this.
1170   if (Opcode == Instruction::Sub)
1171     IncAmt = Builder.CreateNeg(IncAmt);
1172   else
1173     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1174 
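  // As an illustrative sketch for VF=4 with an i32 increment (value names are
  // placeholders), the call below becomes:
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)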
1175   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1176                                 {VTy, IncAmt->getType()},
1177                                 {Address, IncAmt, Mask});
1178 }
1179 
1180 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1181                                                VPCostContext &Ctx) const {
1182   // FIXME: Take the gather and scatter into account as well. For now we're
1183   //        generating the same cost as the fallback path, but we'll likely
1184   //        need to create a new TTI method for determining the cost, including
1185   //        whether we can use base + vec-of-smaller-indices or just
1186   //        vec-of-pointers.
1187   assert(VF.isVector() && "Invalid VF for histogram cost");
1188   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1189   VPValue *IncAmt = getOperand(1);
1190   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1191   VectorType *VTy = VectorType::get(IncTy, VF);
1192 
1193   // Assume that a non-constant update value (or a constant != 1) requires
1194   // a multiply, and add that into the cost.
1195   InstructionCost MulCost =
1196       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1197   if (IncAmt->isLiveIn()) {
1198     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1199 
1200     if (CI && CI->getZExtValue() == 1)
1201       MulCost = TTI::TCC_Free;
1202   }
1203 
1204   // Find the cost of the histogram operation itself.
1205   Type *PtrTy = VectorType::get(AddressTy, VF);
1206   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1207   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1208                               Type::getVoidTy(Ctx.LLVMCtx),
1209                               {PtrTy, IncTy, MaskTy});
1210 
1211   // Add the costs together with the add/sub operation.
1212   return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1213          Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1214 }
1215 
1216 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1217 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1218                               VPSlotTracker &SlotTracker) const {
1219   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1220   getOperand(0)->printAsOperand(O, SlotTracker);
1221 
1222   if (Opcode == Instruction::Sub)
1223     O << ", dec: ";
1224   else {
1225     assert(Opcode == Instruction::Add);
1226     O << ", inc: ";
1227   }
1228   getOperand(1)->printAsOperand(O, SlotTracker);
1229 
1230   if (VPValue *Mask = getMask()) {
1231     O << ", mask: ";
1232     Mask->printAsOperand(O, SlotTracker);
1233   }
1234 }
1235 
1236 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1237                                 VPSlotTracker &SlotTracker) const {
1238   O << Indent << "WIDEN-SELECT ";
1239   printAsOperand(O, SlotTracker);
1240   O << " = select ";
1241   printFlags(O);
1242   getOperand(0)->printAsOperand(O, SlotTracker);
1243   O << ", ";
1244   getOperand(1)->printAsOperand(O, SlotTracker);
1245   O << ", ";
1246   getOperand(2)->printAsOperand(O, SlotTracker);
1247   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1248 }
1249 #endif
1250 
1251 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1252   State.setDebugLocFrom(getDebugLoc());
1253 
1254   // The condition can be loop invariant but still defined inside the
1255   // loop. This means that we can't just use the original 'cond' value.
1256   // We have to take the 'vectorized' value and pick the first lane.
1257   // Instcombine will make this a no-op.
1258   auto *InvarCond =
1259       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1260 
1261   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1262   Value *Op0 = State.get(getOperand(1));
1263   Value *Op1 = State.get(getOperand(2));
1264   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1265   State.set(this, Sel);
1266   if (isa<FPMathOperator>(Sel))
1267     setFlags(cast<Instruction>(Sel));
1268   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1269 }
1270 
1271 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1272                                                  VPCostContext &Ctx) const {
1273   SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
1274   bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1275   Type *ScalarTy = Ctx.Types.inferScalarType(this);
1276   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1277 
1278   VPValue *Op0, *Op1;
1279   using namespace llvm::VPlanPatternMatch;
1280   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1281       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1282        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1283     // select x, y, false --> x & y
1284     // select x, true, y --> x | y
1285     const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1286     const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1287 
1288     SmallVector<const Value *, 2> Operands;
1289     if (all_of(operands(),
1290                [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1291       Operands.append(SI->op_begin(), SI->op_end());
1292     bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1293     return Ctx.TTI.getArithmeticInstrCost(
1294         IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
1295         Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1296   }
1297 
1298   Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1299   if (!ScalarCond)
1300     CondTy = VectorType::get(CondTy, VF);
1301 
1302   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1303   if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
1304     Pred = Cmp->getPredicate();
1305   return Ctx.TTI.getCmpSelInstrCost(
1306       Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1307       {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1308 }
1309 
1310 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
1311     const FastMathFlags &FMF) {
1312   AllowReassoc = FMF.allowReassoc();
1313   NoNaNs = FMF.noNaNs();
1314   NoInfs = FMF.noInfs();
1315   NoSignedZeros = FMF.noSignedZeros();
1316   AllowReciprocal = FMF.allowReciprocal();
1317   AllowContract = FMF.allowContract();
1318   ApproxFunc = FMF.approxFunc();
1319 }
1320 
1321 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1322 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1323   switch (OpType) {
1324   case OperationType::Cmp:
1325     O << " " << CmpInst::getPredicateName(getPredicate());
1326     break;
1327   case OperationType::DisjointOp:
1328     if (DisjointFlags.IsDisjoint)
1329       O << " disjoint";
1330     break;
1331   case OperationType::PossiblyExactOp:
1332     if (ExactFlags.IsExact)
1333       O << " exact";
1334     break;
1335   case OperationType::OverflowingBinOp:
1336     if (WrapFlags.HasNUW)
1337       O << " nuw";
1338     if (WrapFlags.HasNSW)
1339       O << " nsw";
1340     break;
1341   case OperationType::FPMathOp:
1342     getFastMathFlags().print(O);
1343     break;
1344   case OperationType::GEPOp:
1345     if (GEPFlags.isInBounds())
1346       O << " inbounds";
1347     else if (GEPFlags.hasNoUnsignedSignedWrap())
1348       O << " nusw";
1349     if (GEPFlags.hasNoUnsignedWrap())
1350       O << " nuw";
1351     break;
1352   case OperationType::NonNegOp:
1353     if (NonNegFlags.NonNeg)
1354       O << " nneg";
1355     break;
1356   case OperationType::Other:
1357     break;
1358   }
1359   if (getNumOperands() > 0)
1360     O << " ";
1361 }
1362 #endif
1363 
1364 void VPWidenRecipe::execute(VPTransformState &State) {
1365   State.setDebugLocFrom(getDebugLoc());
1366   auto &Builder = State.Builder;
1367   switch (Opcode) {
1368   case Instruction::Call:
1369   case Instruction::Br:
1370   case Instruction::PHI:
1371   case Instruction::GetElementPtr:
1372   case Instruction::Select:
1373     llvm_unreachable("This instruction is handled by a different recipe.");
1374   case Instruction::UDiv:
1375   case Instruction::SDiv:
1376   case Instruction::SRem:
1377   case Instruction::URem:
1378   case Instruction::Add:
1379   case Instruction::FAdd:
1380   case Instruction::Sub:
1381   case Instruction::FSub:
1382   case Instruction::FNeg:
1383   case Instruction::Mul:
1384   case Instruction::FMul:
1385   case Instruction::FDiv:
1386   case Instruction::FRem:
1387   case Instruction::Shl:
1388   case Instruction::LShr:
1389   case Instruction::AShr:
1390   case Instruction::And:
1391   case Instruction::Or:
1392   case Instruction::Xor: {
1393     // Just widen unops and binops.
1394     SmallVector<Value *, 2> Ops;
1395     for (VPValue *VPOp : operands())
1396       Ops.push_back(State.get(VPOp));
1397 
1398     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1399 
1400     if (auto *VecOp = dyn_cast<Instruction>(V))
1401       setFlags(VecOp);
1402 
1403     // Use this vector value for all users of the original instruction.
1404     State.set(this, V);
1405     State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1406     break;
1407   }
1408   case Instruction::Freeze: {
1409     Value *Op = State.get(getOperand(0));
1410 
1411     Value *Freeze = Builder.CreateFreeze(Op);
1412     State.set(this, Freeze);
1413     break;
1414   }
1415   case Instruction::ICmp:
1416   case Instruction::FCmp: {
1417     // Widen compares. Generate vector compares.
1418     bool FCmp = Opcode == Instruction::FCmp;
1419     Value *A = State.get(getOperand(0));
1420     Value *B = State.get(getOperand(1));
1421     Value *C = nullptr;
1422     if (FCmp) {
1423       // Propagate fast math flags.
1424       C = Builder.CreateFCmpFMF(
1425           getPredicate(), A, B,
1426           dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1427     } else {
1428       C = Builder.CreateICmp(getPredicate(), A, B);
1429     }
1430     State.set(this, C);
1431     State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1432     break;
1433   }
1434   default:
1435     // This instruction is not vectorized by simple widening.
1436     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1437                       << Instruction::getOpcodeName(Opcode));
1438     llvm_unreachable("Unhandled instruction!");
1439   } // end of switch.
1440 
1441 #if !defined(NDEBUG)
1442   // Verify that VPlan type inference results agree with the type of the
1443   // generated values.
1444   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1445              State.get(this)->getType() &&
1446          "inferred type and type from generated instructions do not match");
1447 #endif
1448 }
1449 
1450 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1451                                            VPCostContext &Ctx) const {
1452   switch (Opcode) {
1453   case Instruction::FNeg: {
1454     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1455     return Ctx.TTI.getArithmeticInstrCost(
1456         Opcode, VectorTy, Ctx.CostKind,
1457         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1458         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1459   }
1460 
1461   case Instruction::UDiv:
1462   case Instruction::SDiv:
1463   case Instruction::SRem:
1464   case Instruction::URem:
1465     // More complex computation; let the legacy cost model handle this for now.
1466     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1467   case Instruction::Add:
1468   case Instruction::FAdd:
1469   case Instruction::Sub:
1470   case Instruction::FSub:
1471   case Instruction::Mul:
1472   case Instruction::FMul:
1473   case Instruction::FDiv:
1474   case Instruction::FRem:
1475   case Instruction::Shl:
1476   case Instruction::LShr:
1477   case Instruction::AShr:
1478   case Instruction::And:
1479   case Instruction::Or:
1480   case Instruction::Xor: {
1481     VPValue *RHS = getOperand(1);
1482     // Certain instructions can be cheaper to vectorize if they have a constant
1483     // second vector operand. One example of this is shifts on x86.
1484     TargetTransformInfo::OperandValueInfo RHSInfo = {
1485         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1486     if (RHS->isLiveIn())
1487       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1488 
1489     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1490         getOperand(1)->isDefinedOutsideLoopRegions())
1491       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1492     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1493     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1494 
1495     SmallVector<const Value *, 4> Operands;
1496     if (CtxI)
1497       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1498     return Ctx.TTI.getArithmeticInstrCost(
1499         Opcode, VectorTy, Ctx.CostKind,
1500         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1501         RHSInfo, Operands, CtxI, &Ctx.TLI);
1502   }
1503   case Instruction::Freeze: {
1504     // The cost of this opcode is unknown; assume it is the same as 'mul'.
1505     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1506     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
1507                                           Ctx.CostKind);
1508   }
1509   case Instruction::ICmp:
1510   case Instruction::FCmp: {
1511     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1512     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1513     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1514                                       Ctx.CostKind,
1515                                       {TTI::OK_AnyValue, TTI::OP_None},
1516                                       {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1517   }
1518   default:
1519     llvm_unreachable("Unsupported opcode for instruction");
1520   }
1521 }
1522 
1523 void VPWidenEVLRecipe::execute(VPTransformState &State) {
1524   unsigned Opcode = getOpcode();
1525   // TODO: Support other opcodes
1526   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1527     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
1528 
1529   State.setDebugLocFrom(getDebugLoc());
1530 
1531   assert(State.get(getOperand(0))->getType()->isVectorTy() &&
1532          "VPWidenEVLRecipe should not be used for scalars");
1533 
1534   VPValue *EVL = getEVL();
1535   Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
1536   IRBuilderBase &BuilderIR = State.Builder;
1537   VectorBuilder Builder(BuilderIR);
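  // VP intrinsics take an explicit mask and EVL operand; use an all-true mask
  // here and let the EVL operand limit the number of active lanes.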
1538   Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
1539 
1540   SmallVector<Value *, 4> Ops;
1541   for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
1542     VPValue *VPOp = getOperand(I);
1543     Ops.push_back(State.get(VPOp));
1544   }
1545 
1546   Builder.setMask(Mask).setEVL(EVLArg);
1547   Value *VPInst =
1548       Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
1549   // Currently vp-intrinsics only accept fast-math flags.
1550   // TODO: Enable other flags when support is added.
1551   if (isa<FPMathOperator>(VPInst))
1552     setFlags(cast<Instruction>(VPInst));
1553 
1554   State.set(this, VPInst);
1555   State.addMetadata(VPInst,
1556                     dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1557 }
1558 
1559 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1560 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1561                           VPSlotTracker &SlotTracker) const {
1562   O << Indent << "WIDEN ";
1563   printAsOperand(O, SlotTracker);
1564   O << " = " << Instruction::getOpcodeName(Opcode);
1565   printFlags(O);
1566   printOperands(O, SlotTracker);
1567 }
1568 
1569 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1570                              VPSlotTracker &SlotTracker) const {
1571   O << Indent << "WIDEN ";
1572   printAsOperand(O, SlotTracker);
1573   O << " = vp." << Instruction::getOpcodeName(getOpcode());
1574   printFlags(O);
1575   printOperands(O, SlotTracker);
1576 }
1577 #endif
1578 
1579 void VPWidenCastRecipe::execute(VPTransformState &State) {
1580   State.setDebugLocFrom(getDebugLoc());
1581   auto &Builder = State.Builder;
1582   // Vectorize casts.
1583   assert(State.VF.isVector() && "Not vectorizing?");
1584   Type *DestTy = VectorType::get(getResultType(), State.VF);
1585   VPValue *Op = getOperand(0);
1586   Value *A = State.get(Op);
1587   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1588   State.set(this, Cast);
1589   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1590   if (auto *CastOp = dyn_cast<Instruction>(Cast))
1591     setFlags(CastOp);
1592 }
1593 
1594 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
1595                                                VPCostContext &Ctx) const {
1596   // TODO: In some cases, VPWidenCastRecipes are created but not considered in
1597   // the legacy cost model, including truncates/extends when evaluating a
1598   // reduction in a smaller type.
1599   if (!getUnderlyingValue())
1600     return 0;
1601   // Computes the CastContextHint from a recipe that may access memory.
1602   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1603     if (VF.isScalar())
1604       return TTI::CastContextHint::Normal;
1605     if (isa<VPInterleaveRecipe>(R))
1606       return TTI::CastContextHint::Interleave;
1607     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
1608       return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1609                                              : TTI::CastContextHint::Normal;
1610     const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1611     if (WidenMemoryRecipe == nullptr)
1612       return TTI::CastContextHint::None;
1613     if (!WidenMemoryRecipe->isConsecutive())
1614       return TTI::CastContextHint::GatherScatter;
1615     if (WidenMemoryRecipe->isReverse())
1616       return TTI::CastContextHint::Reversed;
1617     if (WidenMemoryRecipe->isMasked())
1618       return TTI::CastContextHint::Masked;
1619     return TTI::CastContextHint::Normal;
1620   };
1621 
1622   VPValue *Operand = getOperand(0);
1623   TTI::CastContextHint CCH = TTI::CastContextHint::None;
1624   // For Trunc/FPTrunc, get the context from the only user.
1625   if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
1626       !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
1627     if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
1628       CCH = ComputeCCH(StoreRecipe);
1629   }
1630   // For ZExt/SExt/FPExt, get the context from the operand.
1631   else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1632            Opcode == Instruction::FPExt) {
1633     if (Operand->isLiveIn())
1634       CCH = TTI::CastContextHint::Normal;
1635     else if (Operand->getDefiningRecipe())
1636       CCH = ComputeCCH(Operand->getDefiningRecipe());
1637   }
1638 
1639   auto *SrcTy =
1640       cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
1641   auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
1642   // Arm TTI will use the underlying instruction to determine the cost.
1643   return Ctx.TTI.getCastInstrCost(
1644       Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
1645       dyn_cast_if_present<Instruction>(getUnderlyingValue()));
1646 }
1647 
1648 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1649 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1650                               VPSlotTracker &SlotTracker) const {
1651   O << Indent << "WIDEN-CAST ";
1652   printAsOperand(O, SlotTracker);
1653   O << " = " << Instruction::getOpcodeName(Opcode);
1654   printFlags(O);
1655   printOperands(O, SlotTracker);
1656   O << " to " << *getResultType();
1657 }
1658 #endif
1659 
1660 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
1661                                                VPCostContext &Ctx) const {
1662   return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
1663 }
1664 
1665 /// This function adds
1666 /// (0 * Step, 1 * Step, 2 * Step, ...)
1667 /// to the corresponding lanes of Val.
1668 /// \p BinOp is only relevant for FP induction variables.
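/// For example, for a fixed VF of 4 with an integer Step of 2, a Val of
/// <%s, %s, %s, %s> becomes <%s + 0, %s + 2, %s + 4, %s + 6>.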
1669 static Value *getStepVector(Value *Val, Value *Step,
1670                             Instruction::BinaryOps BinOp, ElementCount VF,
1671                             IRBuilderBase &Builder) {
1672   assert(VF.isVector() && "only vector VFs are supported");
1673 
1674   // Create and check the types.
1675   auto *ValVTy = cast<VectorType>(Val->getType());
1676   ElementCount VLen = ValVTy->getElementCount();
1677 
1678   Type *STy = Val->getType()->getScalarType();
1679   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1680          "Induction Step must be an integer or FP");
1681   assert(Step->getType() == STy && "Step has wrong type");
1682 
1683   SmallVector<Constant *, 8> Indices;
1684 
1685   // Create a vector of consecutive numbers from zero to VF - 1.
1686   VectorType *InitVecValVTy = ValVTy;
1687   if (STy->isFloatingPointTy()) {
1688     Type *InitVecValSTy =
1689         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1690     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1691   }
1692   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1693 
1694   if (STy->isIntegerTy()) {
1695     Step = Builder.CreateVectorSplat(VLen, Step);
1696     assert(Step->getType() == Val->getType() && "Invalid step vec");
1697     // FIXME: The newly created binary instructions should contain nsw/nuw
1698     // flags, which can be found from the original scalar operations.
1699     Step = Builder.CreateMul(InitVec, Step);
1700     return Builder.CreateAdd(Val, Step, "induction");
1701   }
1702 
1703   // Floating point induction.
1704   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1705          "Binary Opcode should be specified for FP induction");
1706   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1707 
1708   Step = Builder.CreateVectorSplat(VLen, Step);
1709   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1710   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1711 }
1712 
1713 /// A helper function that returns an integer or floating-point constant with
1714 /// value C.
1715 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1716   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1717                            : ConstantFP::get(Ty, C);
1718 }
1719 
1720 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1721   assert(!State.Lane && "Int or FP induction being replicated.");
1722 
1723   Value *Start = getStartValue()->getLiveInIRValue();
1724   const InductionDescriptor &ID = getInductionDescriptor();
1725   TruncInst *Trunc = getTruncInst();
1726   IRBuilderBase &Builder = State.Builder;
1727   assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
1728          "Types must match");
1729   assert(State.VF.isVector() && "must have vector VF");
1730 
1731   // The value from the original loop to which we are mapping the new induction
1732   // variable.
1733   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
1734 
1735   // Fast-math-flags propagate from the original induction instruction.
1736   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1737   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1738     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1739 
1740   // Now do the actual transformations, and start with fetching the step value.
1741   Value *Step = State.get(getStepValue(), VPLane(0));
1742 
1743   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1744          "Expected either an induction phi-node or a truncate of it!");
1745 
1746   // Construct the initial value of the vector IV in the vector loop preheader.
1747   auto CurrIP = Builder.saveIP();
1748   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1749   Builder.SetInsertPoint(VectorPH->getTerminator());
1750   if (isa<TruncInst>(EntryVal)) {
1751     assert(Start->getType()->isIntegerTy() &&
1752            "Truncation requires an integer type");
1753     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1754     Step = Builder.CreateTrunc(Step, TruncType);
1755     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1756   }
1757 
1758   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1759   Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
1760                                       State.VF, State.Builder);
1761 
1762   // We create vector phi nodes for both integer and floating-point induction
1763   // variables. Here, we determine the kind of arithmetic we will perform.
1764   Instruction::BinaryOps AddOp;
1765   Instruction::BinaryOps MulOp;
1766   if (Step->getType()->isIntegerTy()) {
1767     AddOp = Instruction::Add;
1768     MulOp = Instruction::Mul;
1769   } else {
1770     AddOp = ID.getInductionOpcode();
1771     MulOp = Instruction::FMul;
1772   }
1773 
1774   Value *SplatVF;
1775   if (VPValue *SplatVFOperand = getSplatVFValue()) {
1776     // The recipe has been unrolled. In that case, fetch the splat value for the
1777     // induction increment.
1778     SplatVF = State.get(SplatVFOperand);
1779   } else {
1780     // Multiply the vectorization factor by the step using integer or
1781     // floating-point arithmetic as appropriate.
1782     Type *StepType = Step->getType();
1783     Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
1784     if (Step->getType()->isFloatingPointTy())
1785       RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1786     else
1787       RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1788     Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1789 
1790     // Create a vector splat to use in the induction update.
1791     SplatVF = Builder.CreateVectorSplat(State.VF, Mul);
1792   }
1793 
1794   Builder.restoreIP(CurrIP);
1795 
1796   // We may need to add the step a number of times, depending on the unroll
1797   // factor. The last of those goes into the PHI.
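  // For example, with a fixed VF of 4 and an integer step of 1, the vector IV
  // starts at <Start, Start + 1, Start + 2, Start + 3> and each update adds
  // the splat <4, 4, 4, 4>.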
1798   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1799   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1800   VecInd->setDebugLoc(getDebugLoc());
1801   State.set(this, VecInd);
1802 
1803   Instruction *LastInduction = cast<Instruction>(
1804       Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
1805   if (isa<TruncInst>(EntryVal))
1806     State.addMetadata(LastInduction, EntryVal);
1807   LastInduction->setDebugLoc(getDebugLoc());
1808 
1809   VecInd->addIncoming(SteppedStart, VectorPH);
1810   // Add induction update using an incorrect block temporarily. The phi node
1811   // will be fixed after VPlan execution. Note that at this point the latch
1812   // block cannot be used, as it does not exist yet.
1813   // TODO: Model increment value in VPlan, by turning the recipe into a
1814   // multi-def and a subclass of VPHeaderPHIRecipe.
1815   VecInd->addIncoming(LastInduction, VectorPH);
1816 }
1817 
1818 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1819 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1820                                           VPSlotTracker &SlotTracker) const {
1821   O << Indent;
1822   printAsOperand(O, SlotTracker);
1823   O << " = WIDEN-INDUCTION  ";
1824   printOperands(O, SlotTracker);
1825 
1826   if (auto *TI = getTruncInst())
1827     O << " (truncated to " << *TI->getType() << ")";
1828 }
1829 #endif
1830 
1831 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1832   // The step may be defined by a recipe in the preheader (e.g. if it requires
1833   // SCEV expansion), but for the canonical induction the step is required to be
1834   // 1, which is represented as a live-in.
1835   if (getStepValue()->getDefiningRecipe())
1836     return false;
1837   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1838   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1839   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1840   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1841          getScalarType() == CanIV->getScalarType();
1842 }
1843 
1844 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1845 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1846                               VPSlotTracker &SlotTracker) const {
1847   O << Indent;
1848   printAsOperand(O, SlotTracker);
1849   O << " = DERIVED-IV ";
1850   getStartValue()->printAsOperand(O, SlotTracker);
1851   O << " + ";
1852   getOperand(1)->printAsOperand(O, SlotTracker);
1853   O << " * ";
1854   getStepValue()->printAsOperand(O, SlotTracker);
1855 }
1856 #endif
1857 
1858 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1859   // Fast-math-flags propagate from the original induction instruction.
1860   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1861   if (hasFastMathFlags())
1862     State.Builder.setFastMathFlags(getFastMathFlags());
1863 
1864   // Compute scalar induction steps. BaseIV is the scalar induction variable
1865   // on which to base the steps; Step is the size of the step.
1866 
1867   Value *BaseIV = State.get(getOperand(0), VPLane(0));
1868   Value *Step = State.get(getStepValue(), VPLane(0));
1869   IRBuilderBase &Builder = State.Builder;
1870 
1871   // Ensure step has the same type as that of scalar IV.
1872   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1873   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1874 
1875   // We build scalar steps for both integer and floating-point induction
1876   // variables. Here, we determine the kind of arithmetic we will perform.
1877   Instruction::BinaryOps AddOp;
1878   Instruction::BinaryOps MulOp;
1879   if (BaseIVTy->isIntegerTy()) {
1880     AddOp = Instruction::Add;
1881     MulOp = Instruction::Mul;
1882   } else {
1883     AddOp = InductionOpcode;
1884     MulOp = Instruction::FMul;
1885   }
1886 
1887   // Determine the number of scalars we need to generate for each unroll
1888   // iteration.
1889   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1890   // Compute the scalar steps and save the results in State.
1891   Type *IntStepTy =
1892       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1893   Type *VecIVTy = nullptr;
1894   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1895   if (!FirstLaneOnly && State.VF.isScalable()) {
1896     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1897     UnitStepVec =
1898         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1899     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1900     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1901   }
1902 
1903   unsigned StartLane = 0;
1904   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1905   if (State.Lane) {
1906     StartLane = State.Lane->getKnownLane();
1907     EndLane = StartLane + 1;
1908   }
1909   Value *StartIdx0 =
1910       createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
1911 
1912   if (!FirstLaneOnly && State.VF.isScalable()) {
1913     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1914     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1915     if (BaseIVTy->isFloatingPointTy())
1916       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1917     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1918     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1919     State.set(this, Add);
1920     // It's useful to record the lane values too for the known minimum number
1921     // It's also useful to record the per-lane values for the known minimum
1922     // number of elements, so we do that below. This improves code quality when,
1923     // for example, extracting the first element.
1924 
1925   if (BaseIVTy->isFloatingPointTy())
1926     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1927 
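  // Each lane below is assigned BaseIV + (StartIdx0 + Lane) * Step; e.g. for
  // part 0 of a fixed VF of 4 with Step 2 the lanes get BaseIV + 0, BaseIV + 2,
  // BaseIV + 4 and BaseIV + 6.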
1928   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1929     Value *StartIdx = Builder.CreateBinOp(
1930         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1931     // The step returned by `createStepForVF` is a runtime-evaluated value
1932     // when VF is scalable. Otherwise, it should be folded into a Constant.
1933     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1934            "Expected StartIdx to be folded to a constant when VF is not "
1935            "scalable");
1936     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1937     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1938     State.set(this, Add, VPLane(Lane));
1939   }
1940 }
1941 
1942 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1943 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1944                                   VPSlotTracker &SlotTracker) const {
1945   O << Indent;
1946   printAsOperand(O, SlotTracker);
1947   O << " = SCALAR-STEPS ";
1948   printOperands(O, SlotTracker);
1949 }
1950 #endif
1951 
1952 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1953   assert(State.VF.isVector() && "not widening");
1954   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1955   // Construct a vector GEP by widening the operands of the scalar GEP as
1956   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1957   // results in a vector of pointers when at least one operand of the GEP
1958   // is vector-typed. Thus, to keep the representation compact, we only use
1959   // vector-typed operands for loop-varying values.
1960 
1961   if (areAllOperandsInvariant()) {
1962     // If we are vectorizing, but the GEP has only loop-invariant operands,
1963     // the GEP we build (by only using vector-typed operands for
1964     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1965     // produce a vector of pointers, we need to either arbitrarily pick an
1966     // operand to broadcast, or broadcast a clone of the original GEP.
1967     // Here, we broadcast a clone of the original.
1968     //
1969     // TODO: If at some point we decide to scalarize instructions having
1970     //       loop-invariant operands, this special case will no longer be
1971     //       required. We would add the scalarization decision to
1972     //       collectLoopScalars() and teach getVectorValue() to broadcast
1973     //       the lane-zero scalar value.
1974     SmallVector<Value *> Ops;
1975     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1976       Ops.push_back(State.get(getOperand(I), VPLane(0)));
1977 
1978     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1979                                            ArrayRef(Ops).drop_front(), "",
1980                                            getGEPNoWrapFlags());
1981     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1982     State.set(this, Splat);
1983     State.addMetadata(Splat, GEP);
1984   } else {
1985     // If the GEP has at least one loop-varying operand, we are sure to
1986     // produce a vector of pointers unless VF is scalar.
1987     // The pointer operand of the new GEP. If it's loop-invariant, we
1988     // won't broadcast it.
1989     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
1990                                          : State.get(getOperand(0));
1991 
1992     // Collect all the indices for the new GEP. If any index is
1993     // loop-invariant, we won't broadcast it.
1994     SmallVector<Value *, 4> Indices;
1995     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1996       VPValue *Operand = getOperand(I);
1997       if (isIndexLoopInvariant(I - 1))
1998         Indices.push_back(State.get(Operand, VPLane(0)));
1999       else
2000         Indices.push_back(State.get(Operand));
2001     }
2002 
2003     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2004     // but it should be a vector, otherwise.
2005     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
2006                                            Indices, "", getGEPNoWrapFlags());
2007     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2008            "NewGEP is not a pointer vector");
2009     State.set(this, NewGEP);
2010     State.addMetadata(NewGEP, GEP);
2011   }
2012 }
2013 
2014 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2015 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
2016                              VPSlotTracker &SlotTracker) const {
2017   O << Indent << "WIDEN-GEP ";
2018   O << (isPointerLoopInvariant() ? "Inv" : "Var");
2019   for (size_t I = 0; I < getNumOperands() - 1; ++I)
2020     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2021 
2022   O << " ";
2023   printAsOperand(O, SlotTracker);
2024   O << " = getelementptr";
2025   printFlags(O);
2026   printOperands(O, SlotTracker);
2027 }
2028 #endif
2029 
2030 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
2031                            unsigned CurrentPart, IRBuilderBase &Builder) {
2032   // Use i32 for the gep index type when the value is constant,
2033   // or query DataLayout for a more suitable index type otherwise.
2034   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2035   return IsScalable && (IsReverse || CurrentPart > 0)
2036              ? DL.getIndexType(Builder.getPtrTy(0))
2037              : Builder.getInt32Ty();
2038 }
2039 
2040 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) {
2041   auto &Builder = State.Builder;
2042   State.setDebugLocFrom(getDebugLoc());
2043   unsigned CurrentPart = getUnrollPart(*this);
2044   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2045                                 CurrentPart, Builder);
2046 
2047   // The wide store needs to start at the last vector element.
2048   Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2049   if (IndexTy != RunTimeVF->getType())
2050     RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2051   // NumElt = -CurrentPart * RunTimeVF
2052   Value *NumElt = Builder.CreateMul(
2053       ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
2054   // LastLane = 1 - RunTimeVF
2055   Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
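  // Together the two GEPs below offset the pointer by
  // -CurrentPart * RunTimeVF + (1 - RunTimeVF) elements; e.g. for part 0 with
  // a runtime VF of 8 the wide access starts at Ptr - 7, so its lanes, once
  // reversed, cover Ptr down to Ptr - 7.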
2056   Value *Ptr = State.get(getOperand(0), VPLane(0));
2057   Value *ResultPtr =
2058       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2059   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2060                                 getGEPNoWrapFlags());
2061 
2062   State.set(this, ResultPtr, /*IsScalar*/ true);
2063 }
2064 
2065 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2066 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2067                                          VPSlotTracker &SlotTracker) const {
2068   O << Indent;
2069   printAsOperand(O, SlotTracker);
2070   O << " = reverse-vector-pointer";
2071   printFlags(O);
2072   printOperands(O, SlotTracker);
2073 }
2074 #endif
2075 
2076 void VPVectorPointerRecipe::execute(VPTransformState &State) {
2077   auto &Builder = State.Builder;
2078   State.setDebugLocFrom(getDebugLoc());
2079   unsigned CurrentPart = getUnrollPart(*this);
2080   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2081                                 CurrentPart, Builder);
2082   Value *Ptr = State.get(getOperand(0), VPLane(0));
2083 
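  // Each unrolled part addresses the chunk CurrentPart * VF elements past the
  // base pointer; e.g. part 1 with a fixed VF of 4 starts at Ptr + 4.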
2084   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2085   Value *ResultPtr =
2086       Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
2087 
2088   State.set(this, ResultPtr, /*IsScalar*/ true);
2089 }
2090 
2091 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2092 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2093                                   VPSlotTracker &SlotTracker) const {
2094   O << Indent;
2095   printAsOperand(O, SlotTracker);
2096   O << " = vector-pointer ";
2097 
2098   printOperands(O, SlotTracker);
2099 }
2100 #endif
2101 
2102 void VPBlendRecipe::execute(VPTransformState &State) {
2103   assert(isNormalized() && "Expected blend to be normalized!");
2104   State.setDebugLocFrom(getDebugLoc());
2105   // We know that all PHIs in non-header blocks are converted into
2106   // selects, so we don't have to worry about the insertion order and we
2107   // can just use the builder.
2108   // At this point we generate the predication tree. There may be
2109   // duplications since this is a simple recursive scan, but future
2110   // optimizations will clean it up.
2111 
2112   unsigned NumIncoming = getNumIncomingValues();
2113 
2114   // Generate a sequence of selects of the form:
2115   // SELECT(Mask3, In3,
2116   //        SELECT(Mask2, In2,
2117   //               SELECT(Mask1, In1,
2118   //                      In0)))
2119   // Note that Mask0 is never used: lanes for which no path reaches this phi,
2120   // and which are therefore essentially undef, take their value from In0.
2121   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
2122   Value *Result = nullptr;
2123   for (unsigned In = 0; In < NumIncoming; ++In) {
2124     // We might have single edge PHIs (blocks) - use an identity
2125     // 'select' for the first PHI operand.
2126     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
2127     if (In == 0)
2128       Result = In0; // Initialize with the first incoming value.
2129     else {
2130       // Select between the current value and the previous incoming edge
2131       // based on the incoming mask.
2132       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
2133       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
2134     }
2135   }
2136   State.set(this, Result, OnlyFirstLaneUsed);
2137 }
2138 
2139 InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2140                                            VPCostContext &Ctx) const {
2141   // Handle cases where only the first lane is used in the same way as the
2142   // legacy cost model does.
2143   if (vputils::onlyFirstLaneUsed(this))
2144     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2145 
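  // A normalized blend of N incoming values is lowered to N - 1 vector
  // selects; cost each select with an i1 vector condition.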
2146   Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2147   Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2148   return (getNumIncomingValues() - 1) *
2149          Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2150                                     CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2151 }
2152 
2153 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2154 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
2155                           VPSlotTracker &SlotTracker) const {
2156   O << Indent << "BLEND ";
2157   printAsOperand(O, SlotTracker);
2158   O << " =";
2159   if (getNumIncomingValues() == 1) {
2160     // Not a User of any mask: not really blending, this is a
2161     // single-predecessor phi.
2162     O << " ";
2163     getIncomingValue(0)->printAsOperand(O, SlotTracker);
2164   } else {
2165     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2166       O << " ";
2167       getIncomingValue(I)->printAsOperand(O, SlotTracker);
2168       if (I == 0)
2169         continue;
2170       O << "/";
2171       getMask(I)->printAsOperand(O, SlotTracker);
2172     }
2173   }
2174 }
2175 #endif
2176 
2177 void VPReductionRecipe::execute(VPTransformState &State) {
2178   assert(!State.Lane && "Reduction being replicated.");
2179   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2180   RecurKind Kind = RdxDesc.getRecurrenceKind();
2181   // Propagate the fast-math flags carried by the underlying instruction.
2182   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2183   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2184   State.setDebugLocFrom(getDebugLoc());
2185   Value *NewVecOp = State.get(getVecOp());
2186   if (VPValue *Cond = getCondOp()) {
2187     Value *NewCond = State.get(Cond, State.VF.isScalar());
2188     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2189     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2190 
2191     Value *Start;
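    // Inactive lanes are replaced by the reduction's neutral element (e.g. 0
    // for add, 1 for mul) so they do not affect the result; any-of reductions
    // keep the recurrence start value instead.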
2192     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
2193       Start = RdxDesc.getRecurrenceStartValue();
2194     else
2195       Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
2196                                           RdxDesc.getFastMathFlags());
2197     if (State.VF.isVector())
2198       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2199 
2200     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2201     NewVecOp = Select;
2202   }
2203   Value *NewRed;
2204   Value *NextInChain;
2205   if (IsOrdered) {
2206     if (State.VF.isVector())
2207       NewRed =
2208           createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
2209     else
2210       NewRed = State.Builder.CreateBinOp(
2211           (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp);
2212     PrevInChain = NewRed;
2213     NextInChain = NewRed;
2214   } else {
2215     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2216     NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
2217     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2218       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
2219                                    NewRed, PrevInChain);
2220     else
2221       NextInChain = State.Builder.CreateBinOp(
2222           (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain);
2223   }
2224   State.set(this, NextInChain, /*IsScalar*/ true);
2225 }
2226 
2227 void VPReductionEVLRecipe::execute(VPTransformState &State) {
2228   assert(!State.Lane && "Reduction being replicated.");
2229 
2230   auto &Builder = State.Builder;
2231   // Propagate the fast-math flags carried by the underlying instruction.
2232   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2233   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2234   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2235 
2236   RecurKind Kind = RdxDesc.getRecurrenceKind();
2237   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2238   Value *VecOp = State.get(getVecOp());
2239   Value *EVL = State.get(getEVL(), VPLane(0));
2240 
2241   VectorBuilder VBuilder(Builder);
2242   VBuilder.setEVL(EVL);
2243   Value *Mask;
2244   // TODO: move the all-true mask generation into VectorBuilder.
2245   if (VPValue *CondOp = getCondOp())
2246     Mask = State.get(CondOp);
2247   else
2248     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2249   VBuilder.setMask(Mask);
2250 
2251   Value *NewRed;
2252   if (isOrdered()) {
2253     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
2254   } else {
2255     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
2256     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2257       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2258     else
2259       NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(),
2260                                    NewRed, Prev);
2261   }
2262   State.set(this, NewRed, /*IsScalar*/ true);
2263 }
2264 
2265 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2266                                                VPCostContext &Ctx) const {
2267   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
2268   Type *ElementTy = Ctx.Types.inferScalarType(this);
2269   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2270   unsigned Opcode = RdxDesc.getOpcode();
2271 
2272   // TODO: Support any-of and in-loop reductions.
2273   assert(
2274       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2275        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2276       "Any-of reduction not implemented in VPlan-based cost model currently.");
2277   assert(
2278       (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
2279        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2280       "In-loop reduction not implemented in VPlan-based cost model currently.");
2281 
2282   assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() &&
2283          "Inferred type and recurrence type mismatch.");
2284 
2285   // Cost = Reduction cost + BinOp cost
2286   InstructionCost Cost =
2287       Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
2288   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
2289     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
2290     return Cost + Ctx.TTI.getMinMaxReductionCost(
2291                       Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
2292   }
2293 
2294   return Cost + Ctx.TTI.getArithmeticReductionCost(
2295                     Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
2296 }
2297 
2298 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2299 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2300                               VPSlotTracker &SlotTracker) const {
2301   O << Indent << "REDUCE ";
2302   printAsOperand(O, SlotTracker);
2303   O << " = ";
2304   getChainOp()->printAsOperand(O, SlotTracker);
2305   O << " +";
2306   if (isa<FPMathOperator>(getUnderlyingInstr()))
2307     O << getUnderlyingInstr()->getFastMathFlags();
2308   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2309   getVecOp()->printAsOperand(O, SlotTracker);
2310   if (isConditional()) {
2311     O << ", ";
2312     getCondOp()->printAsOperand(O, SlotTracker);
2313   }
2314   O << ")";
2315   if (RdxDesc.IntermediateStore)
2316     O << " (with final reduction value stored in invariant address sank "
2317          "outside of loop)";
2318 }
2319 
2320 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2321                                  VPSlotTracker &SlotTracker) const {
2322   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2323   O << Indent << "REDUCE ";
2324   printAsOperand(O, SlotTracker);
2325   O << " = ";
2326   getChainOp()->printAsOperand(O, SlotTracker);
2327   O << " +";
2328   if (isa<FPMathOperator>(getUnderlyingInstr()))
2329     O << getUnderlyingInstr()->getFastMathFlags();
2330   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2331   getVecOp()->printAsOperand(O, SlotTracker);
2332   O << ", ";
2333   getEVL()->printAsOperand(O, SlotTracker);
2334   if (isConditional()) {
2335     O << ", ";
2336     getCondOp()->printAsOperand(O, SlotTracker);
2337   }
2338   O << ")";
2339   if (RdxDesc.IntermediateStore)
2340     O << " (with final reduction value stored in invariant address sank "
2341          "outside of loop)";
2342 }
2343 #endif
2344 
2345 bool VPReplicateRecipe::shouldPack() const {
2346   // Find if the recipe is used by a widened recipe via an intervening
2347   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2348   return any_of(users(), [](const VPUser *U) {
2349     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2350       return any_of(PredR->users(), [PredR](const VPUser *U) {
2351         return !U->usesScalars(PredR);
2352       });
2353     return false;
2354   });
2355 }
2356 
2357 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
2358                                                VPCostContext &Ctx) const {
2359   Instruction *UI = cast<Instruction>(getUnderlyingValue());
2360   // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
2361   // transform; avoid computing its cost multiple times for now.
2362   Ctx.SkipCostComputation.insert(UI);
2363   return Ctx.getLegacyCost(UI, VF);
2364 }
2365 
2366 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2367 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2368                               VPSlotTracker &SlotTracker) const {
2369   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2370 
2371   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2372     printAsOperand(O, SlotTracker);
2373     O << " = ";
2374   }
2375   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2376     O << "call";
2377     printFlags(O);
2378     O << "@" << CB->getCalledFunction()->getName() << "(";
2379     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2380                     O, [&O, &SlotTracker](VPValue *Op) {
2381                       Op->printAsOperand(O, SlotTracker);
2382                     });
2383     O << ")";
2384   } else {
2385     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2386     printFlags(O);
2387     printOperands(O, SlotTracker);
2388   }
2389 
2390   if (shouldPack())
2391     O << " (S->V)";
2392 }
2393 #endif
2394 
2395 Value *VPScalarCastRecipe::generate(VPTransformState &State) {
2396   State.setDebugLocFrom(getDebugLoc());
2397   assert(vputils::onlyFirstLaneUsed(this) &&
2398          "Codegen only implemented for first lane.");
2399   switch (Opcode) {
2400   case Instruction::SExt:
2401   case Instruction::ZExt:
2402   case Instruction::Trunc: {
2403     // Note: SExt/ZExt not used yet.
2404     Value *Op = State.get(getOperand(0), VPLane(0));
2405     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2406   }
2407   default:
2408     llvm_unreachable("opcode not implemented yet");
2409   }
2410 }
2411 
2412 void VPScalarCastRecipe::execute(VPTransformState &State) {
2413   State.set(this, generate(State), VPLane(0));
2414 }
2415 
2416 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2417 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2418                                 VPSlotTracker &SlotTracker) const {
2419   O << Indent << "SCALAR-CAST ";
2420   printAsOperand(O, SlotTracker);
2421   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2422   printOperands(O, SlotTracker);
2423   O << " to " << *ResultTy;
2424 }
2425 #endif
2426 
2427 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2428   assert(State.Lane && "Branch on Mask works only on single instance.");
2429 
2430   unsigned Lane = State.Lane->getKnownLane();
2431 
2432   Value *ConditionBit = nullptr;
2433   VPValue *BlockInMask = getMask();
2434   if (BlockInMask) {
2435     ConditionBit = State.get(BlockInMask);
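    // The mask is per-lane; for a vector mask, extract the bit belonging to
    // the lane currently being replicated.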
2436     if (ConditionBit->getType()->isVectorTy())
2437       ConditionBit = State.Builder.CreateExtractElement(
2438           ConditionBit, State.Builder.getInt32(Lane));
2439   } else // Block in mask is all-one.
2440     ConditionBit = State.Builder.getTrue();
2441 
2442   // Replace the temporary unreachable terminator with a new conditional branch,
2443   // whose two destinations will be set later when they are created.
2444   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2445   assert(isa<UnreachableInst>(CurrentTerminator) &&
2446          "Expected to replace unreachable terminator with conditional branch.");
2447   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2448   CondBr->setSuccessor(0, nullptr);
2449   ReplaceInstWithInst(CurrentTerminator, CondBr);
2450 }
2451 
2452 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
2453                                                   VPCostContext &Ctx) const {
2454   // The legacy cost model doesn't assign costs to branches for individual
2455   // replicate regions. Match the current behavior in the VPlan cost model for
2456   // now.
2457   return 0;
2458 }
2459 
2460 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2461   State.setDebugLocFrom(getDebugLoc());
2462   assert(State.Lane && "Predicated instruction PHI works per instance.");
2463   Instruction *ScalarPredInst =
2464       cast<Instruction>(State.get(getOperand(0), *State.Lane));
2465   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2466   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2467   assert(PredicatingBB && "Predicated block has no single predecessor.");
2468   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2469          "operand must be VPReplicateRecipe");
2470 
2471   // By current pack/unpack logic we need to generate only a single phi node: if
2472   // a vector value for the predicated instruction exists at this point it means
2473   // the instruction has vector users only, and a phi for the vector value is
2474   // needed. In this case the recipe of the predicated instruction is marked to
2475   // also do that packing, thereby "hoisting" the insert-element sequence.
2476   // Otherwise, a phi node for the scalar value is needed.
2477   if (State.hasVectorValue(getOperand(0))) {
2478     Value *VectorValue = State.get(getOperand(0));
2479     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2480     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2481     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2482     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2483     if (State.hasVectorValue(this))
2484       State.reset(this, VPhi);
2485     else
2486       State.set(this, VPhi);
2487     // NOTE: Currently we need to update the value of the operand, so the next
2488     // predicated iteration inserts its generated value in the correct vector.
2489     State.reset(getOperand(0), VPhi);
2490   } else {
2491     if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
2492       return;
2493 
2494     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2495     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2496     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2497                      PredicatingBB);
2498     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2499     if (State.hasScalarValue(this, *State.Lane))
2500       State.reset(this, Phi, *State.Lane);
2501     else
2502       State.set(this, Phi, *State.Lane);
2503     // NOTE: Currently we need to update the value of the operand, so the next
2504     // predicated iteration inserts its generated value in the correct vector.
2505     State.reset(getOperand(0), Phi, *State.Lane);
2506   }
2507 }
2508 
2509 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2510 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2511                                 VPSlotTracker &SlotTracker) const {
2512   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2513   printAsOperand(O, SlotTracker);
2514   O << " = ";
2515   printOperands(O, SlotTracker);
2516 }
2517 #endif
2518 
2519 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2520                                                  VPCostContext &Ctx) const {
2521   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2522   const Align Alignment =
2523       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2524   unsigned AS =
2525       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2526 
2527   if (!Consecutive) {
2528     // TODO: Using the original IR may not be accurate.
2529     // Currently, ARM will use the underlying IR to calculate gather/scatter
2530     // instruction cost.
2531     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2532     assert(!Reverse &&
2533            "Inconsecutive memory access should not be marked as reversed.");
2534     return Ctx.TTI.getAddressComputationCost(Ty) +
2535            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2536                                           IsMasked, Alignment, Ctx.CostKind,
2537                                           &Ingredient);
2538   }
2539 
2540   InstructionCost Cost = 0;
2541   if (IsMasked) {
2542     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2543                                           AS, Ctx.CostKind);
2544   } else {
2545     TTI::OperandValueInfo OpInfo =
2546         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2547     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2548                                     Ctx.CostKind, OpInfo, &Ingredient);
2549   }
2550   if (!Reverse)
2551     return Cost;
2552 
2553   return Cost +
2554          Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2555                                 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
2556 }
2557 
2558 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2559   auto *LI = cast<LoadInst>(&Ingredient);
2560 
2561   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2562   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2563   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2564   bool CreateGather = !isConsecutive();
2565 
2566   auto &Builder = State.Builder;
2567   State.setDebugLocFrom(getDebugLoc());
2568   Value *Mask = nullptr;
2569   if (auto *VPMask = getMask()) {
2570     // Mask reversal is only needed for non-all-one masks; an all-one mask is
2571     // represented as a null mask, and its reverse is still all-one (null).
2572     Mask = State.get(VPMask);
2573     if (isReverse())
2574       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2575   }
2576 
2577   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
2578   Value *NewLI;
2579   if (CreateGather) {
2580     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2581                                        "wide.masked.gather");
2582   } else if (Mask) {
2583     NewLI =
2584         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2585                                  PoisonValue::get(DataTy), "wide.masked.load");
2586   } else {
2587     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2588   }
2589   // Add metadata to the load, but set the vector value to the reverse shuffle.
2590   State.addMetadata(NewLI, LI);
2591   if (Reverse)
2592     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2593   State.set(this, NewLI);
2594 }
2595 
2596 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2597 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2598                               VPSlotTracker &SlotTracker) const {
2599   O << Indent << "WIDEN ";
2600   printAsOperand(O, SlotTracker);
2601   O << " = load ";
2602   printOperands(O, SlotTracker);
2603 }
2604 #endif
2605 
2606 /// Use an all-true mask for the reverse rather than the actual mask, as this
2607 /// avoids a dependence without affecting the result.
2608 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2609                                      Value *EVL, const Twine &Name) {
2610   VectorType *ValTy = cast<VectorType>(Operand->getType());
2611   Value *AllTrueMask =
2612       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2613   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2614                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2615 }
2616 
2617 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2618   auto *LI = cast<LoadInst>(&Ingredient);
2619 
2620   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2621   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2622   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2623   bool CreateGather = !isConsecutive();
2624 
2625   auto &Builder = State.Builder;
2626   State.setDebugLocFrom(getDebugLoc());
2627   CallInst *NewLI;
2628   Value *EVL = State.get(getEVL(), VPLane(0));
2629   Value *Addr = State.get(getAddr(), !CreateGather);
2630   Value *Mask = nullptr;
2631   if (VPValue *VPMask = getMask()) {
2632     Mask = State.get(VPMask);
2633     if (isReverse())
2634       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2635   } else {
2636     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2637   }
2638 
2639   if (CreateGather) {
2640     NewLI =
2641         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2642                                 nullptr, "wide.masked.gather");
2643   } else {
2644     VectorBuilder VBuilder(Builder);
2645     VBuilder.setEVL(EVL).setMask(Mask);
2646     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2647         Instruction::Load, DataTy, Addr, "vp.op.load"));
2648   }
2649   NewLI->addParamAttr(
2650       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2651   State.addMetadata(NewLI, LI);
2652   Instruction *Res = NewLI;
2653   if (isReverse())
2654     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2655   State.set(this, Res);
2656 }
2657 
2658 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
2659                                                   VPCostContext &Ctx) const {
2660   if (!Consecutive || IsMasked)
2661     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2662 
2663   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
2664   // here because the EVL recipes use EVL to replace the tail mask, while the
2665   // legacy model always accounts for the cost of the mask.
2666   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2667   // longer need to compare to the legacy cost model.
2668   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2669   const Align Alignment =
2670       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2671   unsigned AS =
2672       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2673   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2674       Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
2675   if (!Reverse)
2676     return Cost;
2677 
2678   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2679                                        cast<VectorType>(Ty), {}, Ctx.CostKind,
2680                                        0);
2681 }
2682 
2683 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2684 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2685                                  VPSlotTracker &SlotTracker) const {
2686   O << Indent << "WIDEN ";
2687   printAsOperand(O, SlotTracker);
2688   O << " = vp.load ";
2689   printOperands(O, SlotTracker);
2690 }
2691 #endif
2692 
2693 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2694   auto *SI = cast<StoreInst>(&Ingredient);
2695 
2696   VPValue *StoredVPValue = getStoredValue();
2697   bool CreateScatter = !isConsecutive();
2698   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2699 
2700   auto &Builder = State.Builder;
2701   State.setDebugLocFrom(getDebugLoc());
2702 
2703   Value *Mask = nullptr;
2704   if (auto *VPMask = getMask()) {
2705     // Mask reversal is only needed for non-all-one masks; an all-one mask is
2706     // represented as null, and the reverse of an all-one mask is still all-one.
2707     Mask = State.get(VPMask);
2708     if (isReverse())
2709       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2710   }
2711 
2712   Value *StoredVal = State.get(StoredVPValue);
2713   if (isReverse()) {
2714     // If we store to reverse consecutive memory locations, then we need
2715     // to reverse the order of elements in the stored value.
2716     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2717     // We don't want to update the value in the map as it might be used in
2718     // another expression. So don't call resetVectorValue(StoredVal).
2719   }
2720   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2721   Instruction *NewSI = nullptr;
2722   if (CreateScatter)
2723     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2724   else if (Mask)
2725     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2726   else
2727     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2728   State.addMetadata(NewSI, SI);
2729 }
2730 
2731 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2732 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2733                                VPSlotTracker &SlotTracker) const {
2734   O << Indent << "WIDEN store ";
2735   printOperands(O, SlotTracker);
2736 }
2737 #endif
2738 
2739 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2740   auto *SI = cast<StoreInst>(&Ingredient);
2741 
2742   VPValue *StoredValue = getStoredValue();
2743   bool CreateScatter = !isConsecutive();
2744   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2745 
2746   auto &Builder = State.Builder;
2747   State.setDebugLocFrom(getDebugLoc());
2748 
2749   CallInst *NewSI = nullptr;
2750   Value *StoredVal = State.get(StoredValue);
2751   Value *EVL = State.get(getEVL(), VPLane(0));
2752   if (isReverse())
2753     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2754   Value *Mask = nullptr;
2755   if (VPValue *VPMask = getMask()) {
2756     Mask = State.get(VPMask);
2757     if (isReverse())
2758       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2759   } else {
2760     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2761   }
2762   Value *Addr = State.get(getAddr(), !CreateScatter);
2763   if (CreateScatter) {
2764     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2765                                     Intrinsic::vp_scatter,
2766                                     {StoredVal, Addr, Mask, EVL});
2767   } else {
2768     VectorBuilder VBuilder(Builder);
2769     VBuilder.setEVL(EVL).setMask(Mask);
2770     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2771         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2772         {StoredVal, Addr}));
2773   }
2774   NewSI->addParamAttr(
2775       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2776   State.addMetadata(NewSI, SI);
2777 }
2778 
2779 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
2780                                                    VPCostContext &Ctx) const {
2781   if (!Consecutive || IsMasked)
2782     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2783 
2784   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2785   // because the EVL recipes use EVL to replace the tail mask, while the
2786   // legacy model always accounts for the cost of the mask.
2787   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2788   // longer need to compare to the legacy cost model.
2789   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2790   const Align Alignment =
2791       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2792   unsigned AS =
2793       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2794   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2795       Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
2796   if (!Reverse)
2797     return Cost;
2798 
2799   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2800                                        cast<VectorType>(Ty), {}, Ctx.CostKind,
2801                                        0);
2802 }
2803 
2804 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2805 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2806                                   VPSlotTracker &SlotTracker) const {
2807   O << Indent << "WIDEN vp.store ";
2808   printOperands(O, SlotTracker);
2809 }
2810 #endif
2811 
2812 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2813                                      VectorType *DstVTy, const DataLayout &DL) {
2814   // Verify that V is a vector with the same number of elements as DstVTy.
2815   auto VF = DstVTy->getElementCount();
2816   auto *SrcVecTy = cast<VectorType>(V->getType());
2817   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2818   Type *SrcElemTy = SrcVecTy->getElementType();
2819   Type *DstElemTy = DstVTy->getElementType();
2820   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2821          "Vector elements must have same size");
2822 
2823   // Do a direct cast if element types are castable.
2824   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2825     return Builder.CreateBitOrPointerCast(V, DstVTy);
2826   }
2827   // V cannot be directly cast to the desired vector type. This may happen
2828   // when V is a floating point vector but DstVTy is a vector of pointers, or
2829   // vice-versa. Handle this with a two-step cast through an intermediate
2830   // integer vector type, i.e. Ptr <-> Int <-> Float.
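  // For illustration, assuming 64-bit pointers: casting V = <2 x double> to
  // DstVTy = <2 x ptr> first bitcasts V to the intermediate <2 x i64>, and the
  // second CreateBitOrPointerCast then emits an inttoptr to <2 x ptr>.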
2831   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2832          "Only one type should be a pointer type");
2833   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2834          "Only one type should be a floating point type");
2835   Type *IntTy =
2836       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2837   auto *VecIntTy = VectorType::get(IntTy, VF);
2838   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2839   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2840 }
2841 
2842 /// Return a vector containing interleaved elements from multiple
2843 /// smaller input vectors.
2844 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2845                                 const Twine &Name) {
2846   unsigned Factor = Vals.size();
2847   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2848 
2849   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2850 #ifndef NDEBUG
2851   for (Value *Val : Vals)
2852     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2853 #endif
2854 
2855   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2856   // must use intrinsics to interleave.
2857   if (VecTy->isScalableTy()) {
2858     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2859     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2860                                    Vals,
2861                                    /*FMFSource=*/nullptr, Name);
2862   }
2863 
2864   // Fixed length. Start by concatenating all vectors into a wide vector.
2865   Value *WideVec = concatenateVectors(Builder, Vals);
2866 
2867   // Interleave the elements into the wide vector.
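  // For illustration, with two fixed <4 x i32> inputs A and B, WideVec is
  // <a0,a1,a2,a3,b0,b1,b2,b3> and createInterleaveMask(4, 2) is
  // <0,4,1,5,2,6,3,7>, producing <a0,b0,a1,b1,a2,b2,a3,b3>.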
2868   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2869   return Builder.CreateShuffleVector(
2870       WideVec, createInterleaveMask(NumElts, Factor), Name);
2871 }
2872 
2873 // Try to vectorize the interleave group that \p Instr belongs to.
2874 //
2875 // E.g. Translate the following interleaved load group (factor = 3):
2876 //   for (i = 0; i < N; i+=3) {
2877 //     R = Pic[i];             // Member of index 0
2878 //     G = Pic[i+1];           // Member of index 1
2879 //     B = Pic[i+2];           // Member of index 2
2880 //     ... // do something to R, G, B
2881 //   }
2882 // To:
2883 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2884 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2885 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2886 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2887 //
2888 // Or translate the following interleaved store group (factor = 3):
2889 //   for (i = 0; i < N; i+=3) {
2890 //     ... do something to R, G, B
2891 //     Pic[i]   = R;           // Member of index 0
2892 //     Pic[i+1] = G;           // Member of index 1
2893 //     Pic[i+2] = B;           // Member of index 2
2894 //   }
2895 // To:
2896 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2897 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2898 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2899 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2900 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2901 void VPInterleaveRecipe::execute(VPTransformState &State) {
2902   assert(!State.Lane && "Interleave group being replicated.");
2903   const InterleaveGroup<Instruction> *Group = IG;
2904   Instruction *Instr = Group->getInsertPos();
2905 
2906   // Prepare the vector type of the interleaved load/store.
2907   Type *ScalarTy = getLoadStoreType(Instr);
2908   unsigned InterleaveFactor = Group->getFactor();
2909   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2910 
2911   // TODO: extend the masked interleaved-group support to reversed access.
2912   VPValue *BlockInMask = getMask();
2913   assert((!BlockInMask || !Group->isReverse()) &&
2914          "Reversed masked interleave-group not supported.");
2915 
2916   VPValue *Addr = getAddr();
2917   Value *ResAddr = State.get(Addr, VPLane(0));
2918   if (auto *I = dyn_cast<Instruction>(ResAddr))
2919     State.setDebugLocFrom(I->getDebugLoc());
2920 
2921   // If the group is reversed, adjust the index to refer to the last vector lane
2922   // instead of the first. We adjust the index from the first vector lane,
2923   // rather than directly getting the pointer for lane VF - 1, because the
2924   // pointer operand of the interleaved access is supposed to be uniform.
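  // For example, with a fixed VF of 4 and factor 3, the base pointer is
  // adjusted by -(4 - 1) * 3 = -9 elements, so the wide access starts 9
  // elements before the original pointer.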
2925   if (Group->isReverse()) {
2926     Value *RuntimeVF =
2927         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2928     Value *Index =
2929         State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2930     Index = State.Builder.CreateMul(Index,
2931                                     State.Builder.getInt32(Group->getFactor()));
2932     Index = State.Builder.CreateNeg(Index);
2933 
2934     bool InBounds = false;
2935     if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
2936       InBounds = Gep->isInBounds();
2937     ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
2938   }
2939 
2940   State.setDebugLocFrom(Instr->getDebugLoc());
2941   Value *PoisonVec = PoisonValue::get(VecTy);
2942 
2943   auto CreateGroupMask = [&BlockInMask, &State,
2944                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
2945     if (State.VF.isScalable()) {
2946       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2947       assert(InterleaveFactor == 2 &&
2948              "Unsupported deinterleave factor for scalable vectors");
2949       auto *ResBlockInMask = State.get(BlockInMask);
2950       SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2951       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2952                                      State.VF.getKnownMinValue() * 2, true);
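      // Interleaving the block mask with itself duplicates each lane's bit for
      // both group members, e.g. a <vscale x 4 x i1> mask <m0, m1, m2, m3, ...>
      // becomes the <vscale x 8 x i1> mask <m0, m0, m1, m1, m2, m2, m3, m3, ...>.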
2953       return State.Builder.CreateIntrinsic(
2954           MaskTy, Intrinsic::vector_interleave2, Ops,
2955           /*FMFSource=*/nullptr, "interleaved.mask");
2956     }
2957 
2958     if (!BlockInMask)
2959       return MaskForGaps;
2960 
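    // For example, with factor 3 and a fixed VF of 4, createReplicatedMask
    // produces <0,0,0,1,1,1,2,2,2,3,3,3>, replicating each lane of the block
    // mask once per group member.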
2961     Value *ResBlockInMask = State.get(BlockInMask);
2962     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2963         ResBlockInMask,
2964         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2965         "interleaved.mask");
2966     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2967                                                    ShuffledMask, MaskForGaps)
2968                        : ShuffledMask;
2969   };
2970 
2971   const DataLayout &DL = Instr->getDataLayout();
2972   // Vectorize the interleaved load group.
2973   if (isa<LoadInst>(Instr)) {
2974     Value *MaskForGaps = nullptr;
2975     if (NeedsMaskForGaps) {
2976       MaskForGaps = createBitMaskForGaps(State.Builder,
2977                                          State.VF.getKnownMinValue(), *Group);
2978       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2979     }
2980 
2981     Instruction *NewLoad;
2982     if (BlockInMask || MaskForGaps) {
2983       Value *GroupMask = CreateGroupMask(MaskForGaps);
2984       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
2985                                                Group->getAlign(), GroupMask,
2986                                                PoisonVec, "wide.masked.vec");
2987     } else
2988       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
2989                                                 Group->getAlign(), "wide.vec");
2990     Group->addMetadata(NewLoad);
2991 
2992     ArrayRef<VPValue *> VPDefs = definedValues();
2993     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2994     if (VecTy->isScalableTy()) {
2995       assert(InterleaveFactor == 2 &&
2996              "Unsupported deinterleave factor for scalable vectors");
2997 
2998       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2999       // so must use intrinsics to deinterleave.
3000       Value *DI = State.Builder.CreateIntrinsic(
3001           Intrinsic::vector_deinterleave2, VecTy, NewLoad,
3002           /*FMFSource=*/nullptr, "strided.vec");
3003       unsigned J = 0;
3004       for (unsigned I = 0; I < InterleaveFactor; ++I) {
3005         Instruction *Member = Group->getMember(I);
3006 
3007         if (!Member)
3008           continue;
3009 
3010         Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
3011         // If this member has a different type, cast the result to that type.
3012         if (Member->getType() != ScalarTy) {
3013           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3014           StridedVec =
3015               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3016         }
3017 
3018         if (Group->isReverse())
3019           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3020 
3021         State.set(VPDefs[J], StridedVec);
3022         ++J;
3023       }
3024 
3025       return;
3026     }
3027 
3028     // For each member in the group, shuffle out the appropriate data from the
3029   // wide load.
3030     unsigned J = 0;
3031     for (unsigned I = 0; I < InterleaveFactor; ++I) {
3032       Instruction *Member = Group->getMember(I);
3033 
3034       // Skip the gaps in the group.
3035       if (!Member)
3036         continue;
3037 
3038       auto StrideMask =
3039           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
3040       Value *StridedVec =
3041           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
3042 
3043       // If this member has a different type, cast the result to that type.
3044       if (Member->getType() != ScalarTy) {
3045         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
3046         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3047         StridedVec =
3048             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3049       }
3050 
3051       if (Group->isReverse())
3052         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3053 
3054       State.set(VPDefs[J], StridedVec);
3055       ++J;
3056     }
3057     return;
3058   }
3059 
3060   // The sub-vector type for the current instruction.
3061   auto *SubVT = VectorType::get(ScalarTy, State.VF);
3062 
3063   // Vectorize the interleaved store group.
3064   Value *MaskForGaps =
3065       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
3066   assert((!MaskForGaps || !State.VF.isScalable()) &&
3067          "masking gaps for scalable vectors is not yet supported.");
3068   ArrayRef<VPValue *> StoredValues = getStoredValues();
3069   // Collect the stored vector from each member.
3070   SmallVector<Value *, 4> StoredVecs;
3071   unsigned StoredIdx = 0;
3072   for (unsigned i = 0; i < InterleaveFactor; i++) {
3073     assert((Group->getMember(i) || MaskForGaps) &&
3074            "Failed to get a member from an interleaved store group");
3075     Instruction *Member = Group->getMember(i);
3076 
3077     // Skip the gaps in the group.
3078     if (!Member) {
3079       Value *Undef = PoisonValue::get(SubVT);
3080       StoredVecs.push_back(Undef);
3081       continue;
3082     }
3083 
3084     Value *StoredVec = State.get(StoredValues[StoredIdx]);
3085     ++StoredIdx;
3086 
3087     if (Group->isReverse())
3088       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
3089 
3090     // If this member has a different type, cast it to the unified type.
3092     if (StoredVec->getType() != SubVT)
3093       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
3094 
3095     StoredVecs.push_back(StoredVec);
3096   }
3097 
3098   // Interleave all the smaller vectors into one wider vector.
3099   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
3100   Instruction *NewStoreInstr;
3101   if (BlockInMask || MaskForGaps) {
3102     Value *GroupMask = CreateGroupMask(MaskForGaps);
3103     NewStoreInstr = State.Builder.CreateMaskedStore(
3104         IVec, ResAddr, Group->getAlign(), GroupMask);
3105   } else
3106     NewStoreInstr =
3107         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
3108 
3109   Group->addMetadata(NewStoreInstr);
3110 }
3111 
3112 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3113 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3114                                VPSlotTracker &SlotTracker) const {
3115   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3116   IG->getInsertPos()->printAsOperand(O, false);
3117   O << ", ";
3118   getAddr()->printAsOperand(O, SlotTracker);
3119   VPValue *Mask = getMask();
3120   if (Mask) {
3121     O << ", ";
3122     Mask->printAsOperand(O, SlotTracker);
3123   }
3124 
3125   unsigned OpIdx = 0;
3126   for (unsigned i = 0; i < IG->getFactor(); ++i) {
3127     if (!IG->getMember(i))
3128       continue;
3129     if (getNumStoreOperands() > 0) {
3130       O << "\n" << Indent << "  store ";
3131       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3132       O << " to index " << i;
3133     } else {
3134       O << "\n" << Indent << "  ";
3135       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3136       O << " = load from index " << i;
3137     }
3138     ++OpIdx;
3139   }
3140 }
3141 #endif
3142 
3143 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3144                                                 VPCostContext &Ctx) const {
3145   Instruction *InsertPos = getInsertPos();
3146   // Find the VPValue index of the interleave group. We need to skip gaps.
3147   unsigned InsertPosIdx = 0;
3148   for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3149     if (auto *Member = IG->getMember(Idx)) {
3150       if (Member == InsertPos)
3151         break;
3152       InsertPosIdx++;
3153     }
3154   Type *ValTy = Ctx.Types.inferScalarType(
3155       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
3156                                 : getStoredValues()[InsertPosIdx]);
3157   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3158   unsigned AS = getLoadStoreAddressSpace(InsertPos);
3159 
3160   unsigned InterleaveFactor = IG->getFactor();
3161   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
3162 
3163   // Holds the indices of existing members in the interleaved group.
3164   SmallVector<unsigned, 4> Indices;
3165   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3166     if (IG->getMember(IF))
3167       Indices.push_back(IF);
3168 
3169   // Calculate the cost of the whole interleaved group.
3170   InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
3171       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
3172       IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
3173 
3174   if (!IG->isReverse())
3175     return Cost;
3176 
3177   return Cost + IG->getNumMembers() *
3178                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
3179                                            VectorTy, std::nullopt, Ctx.CostKind,
3180                                            0);
3181 }
3182 
3183 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3184 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3185                                    VPSlotTracker &SlotTracker) const {
3186   O << Indent << "EMIT ";
3187   printAsOperand(O, SlotTracker);
3188   O << " = CANONICAL-INDUCTION ";
3189   printOperands(O, SlotTracker);
3190 }
3191 #endif
3192 
3193 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
3194   return IsScalarAfterVectorization &&
3195          (!IsScalable || vputils::onlyFirstLaneUsed(this));
3196 }
3197 
3198 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
3199   assert(getInductionDescriptor().getKind() ==
3200              InductionDescriptor::IK_PtrInduction &&
3201          "Not a pointer induction according to InductionDescriptor!");
3202   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
3203          "Unexpected type.");
3204   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
3205          "Recipe should have been replaced");
3206 
3207   unsigned CurrentPart = getUnrollPart(*this);
3208 
3209   // Build a pointer phi
3210   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
3211   Type *ScStValueType = ScalarStartValue->getType();
3212 
3213   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3214   PHINode *NewPointerPhi = nullptr;
3215   if (CurrentPart == 0) {
3216     auto *IVR = cast<VPHeaderPHIRecipe>(&getParent()
3217                                              ->getPlan()
3218                                              ->getVectorLoopRegion()
3219                                              ->getEntryBasicBlock()
3220                                              ->front());
3221     PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
3222     NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
3223                                     CanonicalIV->getIterator());
3224     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
3225     NewPointerPhi->setDebugLoc(getDebugLoc());
3226   } else {
3227     // The recipe has been unrolled. In that case, fetch the single pointer phi
3228     // shared among all unrolled parts of the recipe.
3229     auto *GEP =
3230         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
3231     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
3232   }
3233 
3234   // A pointer induction, performed by using a gep
3235   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
3236   Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
3237   Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
3238   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
3239   // Add induction update using an incorrect block temporarily. The phi node
3240   // will be fixed after VPlan execution. Note that at this point the latch
3241   // block cannot be used, as it does not exist yet.
3242   // TODO: Model increment value in VPlan, by turning the recipe into a
3243   // multi-def and a subclass of VPHeaderPHIRecipe.
3244   if (CurrentPart == 0) {
3245     // The recipe represents the first part of the pointer induction. Create the
3246     // GEP to increment the phi across all unrolled parts.
3247     unsigned UF = getParent()->getPlan()->getUF();
3248     Value *NumUnrolledElems =
3249         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
3250 
3251     Value *InductionGEP = GetElementPtrInst::Create(
3252         State.Builder.getInt8Ty(), NewPointerPhi,
3253         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
3254         InductionLoc);
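    // For example, with a fixed VF of 4, UF = 2 and scalar step S, the pointer
    // phi advances by 4 * 2 * S = 8 * S bytes per vector iteration.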
3255 
3256     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
3257   }
3258 
3259   // Create actual address geps that use the pointer phi as base and a
3260   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
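  // For example, with a fixed VF of 4, unroll part 1 and scalar step S, the
  // byte offsets are <4, 5, 6, 7> * S relative to the pointer phi.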
3261   Type *VecPhiType = VectorType::get(PhiType, State.VF);
3262   Value *StartOffsetScalar = State.Builder.CreateMul(
3263       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
3264   Value *StartOffset =
3265       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
3266   // Create a vector of consecutive numbers from zero to VF.
3267   StartOffset = State.Builder.CreateAdd(
3268       StartOffset, State.Builder.CreateStepVector(VecPhiType));
3269 
3270   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
3271          "scalar step must be the same across all parts");
3272   Value *GEP = State.Builder.CreateGEP(
3273       State.Builder.getInt8Ty(), NewPointerPhi,
3274       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
3275                                                State.VF, ScalarStepValue)),
3276       "vector.gep");
3277   State.set(this, GEP);
3278 }
3279 
3280 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3281 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
3282                                           VPSlotTracker &SlotTracker) const {
3283   assert((getNumOperands() == 2 || getNumOperands() == 4) &&
3284          "unexpected number of operands");
3285   O << Indent << "EMIT ";
3286   printAsOperand(O, SlotTracker);
3287   O << " = WIDEN-POINTER-INDUCTION ";
3288   getStartValue()->printAsOperand(O, SlotTracker);
3289   O << ", ";
3290   getStepValue()->printAsOperand(O, SlotTracker);
3291   if (getNumOperands() == 4) {
3292     O << ", ";
3293     getOperand(2)->printAsOperand(O, SlotTracker);
3294     O << ", ";
3295     getOperand(3)->printAsOperand(O, SlotTracker);
3296   }
3297 }
3298 #endif
3299 
3300 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3301   assert(!State.Lane && "cannot be used in per-lane");
3302   if (State.ExpandedSCEVs.contains(Expr)) {
3303     // SCEV Expr has already been expanded, result must already be set. At the
3304     // moment we have to execute the entry block twice (once before skeleton
3305     // creation to get expanded SCEVs used by the skeleton and once during
3306     // regular VPlan execution).
3307     State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
3308     assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
3309            "Results must match");
3310     return;
3311   }
3312 
3313   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3314   SCEVExpander Exp(SE, DL, "induction");
3315 
3316   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
3317                                  &*State.Builder.GetInsertPoint());
3318   State.ExpandedSCEVs[Expr] = Res;
3319   State.set(this, Res, VPLane(0));
3320 }
3321 
3322 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3323 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3324                                VPSlotTracker &SlotTracker) const {
3325   O << Indent << "EMIT ";
3326   printAsOperand(O, SlotTracker);
3327   O << " = EXPAND SCEV " << *Expr;
3328 }
3329 #endif
3330 
3331 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3332   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3333   Type *STy = CanonicalIV->getType();
3334   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3335   ElementCount VF = State.VF;
3336   Value *VStart = VF.isScalar()
3337                       ? CanonicalIV
3338                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3339   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3340   if (VF.isVector()) {
3341     VStep = Builder.CreateVectorSplat(VF, VStep);
3342     VStep =
3343         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3344   }
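  // For example, with a fixed VF of 4 and unroll part p, the result is
  // <iv + 4p, iv + 4p + 1, iv + 4p + 2, iv + 4p + 3>.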
3345   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3346   State.set(this, CanonicalVectorIV);
3347 }
3348 
3349 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3350 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3351                                      VPSlotTracker &SlotTracker) const {
3352   O << Indent << "EMIT ";
3353   printAsOperand(O, SlotTracker);
3354   O << " = WIDEN-CANONICAL-INDUCTION ";
3355   printOperands(O, SlotTracker);
3356 }
3357 #endif
3358 
3359 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3360   auto &Builder = State.Builder;
3361   // Create a vector from the initial value.
3362   auto *VectorInit = getStartValue()->getLiveInIRValue();
3363 
3364   Type *VecTy = State.VF.isScalar()
3365                     ? VectorInit->getType()
3366                     : VectorType::get(VectorInit->getType(), State.VF);
3367 
3368   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3369   if (State.VF.isVector()) {
3370     auto *IdxTy = Builder.getInt32Ty();
3371     auto *One = ConstantInt::get(IdxTy, 1);
3372     IRBuilder<>::InsertPointGuard Guard(Builder);
3373     Builder.SetInsertPoint(VectorPH->getTerminator());
3374     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3375     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3376     VectorInit = Builder.CreateInsertElement(
3377         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3378   }
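  // For example, with a fixed VF of 4, vector.recur.init is
  // <poison, poison, poison, %init>, placing the initial value in the last lane.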
3379 
3380   // Create a phi node for the new recurrence.
3381   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3382   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3383   Phi->addIncoming(VectorInit, VectorPH);
3384   State.set(this, Phi);
3385 }
3386 
3387 InstructionCost
3388 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
3389                                              VPCostContext &Ctx) const {
3390   if (VF.isScalar())
3391     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3392 
3393   if (VF.isScalable() && VF.getKnownMinValue() == 1)
3394     return InstructionCost::getInvalid();
3395 
3396   SmallVector<int> Mask(VF.getKnownMinValue());
3397   std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
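  // For example, with a fixed VF of 4 the splice mask is <3, 4, 5, 6>, taking
  // the last lane of the first vector and the first three lanes of the second.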
3398   Type *VectorTy =
3399       toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
3400 
3401   return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
3402                                 cast<VectorType>(VectorTy), Mask, Ctx.CostKind,
3403                                 VF.getKnownMinValue() - 1);
3404 }
3405 
3406 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3407 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3408                                             VPSlotTracker &SlotTracker) const {
3409   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3410   printAsOperand(O, SlotTracker);
3411   O << " = phi ";
3412   printOperands(O, SlotTracker);
3413 }
3414 #endif
3415 
3416 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3417   auto &Builder = State.Builder;
3418 
3419   // If this phi is fed by a scaled reduction then it should output a
3420   // vector with fewer elements than the VF.
3421   ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);
3422 
3423   // Reductions do not have to start at zero. They can start with
3424   // any loop invariant values.
3425   VPValue *StartVPV = getStartValue();
3426   Value *StartV = StartVPV->getLiveInIRValue();
3427 
3428   // In order to support recurrences we need to be able to vectorize Phi nodes.
3429   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3430   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3431   // this value when we vectorize all of the instructions that use the PHI.
3432   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3433   Type *VecTy =
3434       ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);
3435 
3436   BasicBlock *HeaderBB = State.CFG.PrevBB;
3437   assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
3438          "recipe must be in the vector loop header");
3439   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3440   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3441   State.set(this, Phi, IsInLoop);
3442 
3443   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3444 
3445   Value *Iden = nullptr;
3446   RecurKind RK = RdxDesc.getRecurrenceKind();
3447   unsigned CurrentPart = getUnrollPart(*this);
3448 
3449   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3450       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3451     // MinMax and AnyOf reductions have the start value as their identity.
3452     if (ScalarPHI) {
3453       Iden = StartV;
3454     } else {
3455       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3456       Builder.SetInsertPoint(VectorPH->getTerminator());
3457       StartV = Iden = State.get(StartVPV);
3458     }
3459   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
3460     // [I|F]FindLastIV will use a sentinel value to initialize the reduction
3461     // phi or the resume value from the main vector loop when vectorizing the
3462     // epilogue loop. In the exit block, ComputeReductionResult will generate
3463     // checks to verify if the reduction result is the sentinel value. If the
3464     // result is the sentinel value, it will be corrected back to the start
3465     // value.
3466     // TODO: The sentinel value is not always necessary. When the start value is
3467     // a constant, and smaller than the start value of the induction variable,
3468     // the start value can be directly used to initialize the reduction phi.
3469     Iden = StartV;
3470     if (!ScalarPHI) {
3471       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3472       Builder.SetInsertPoint(VectorPH->getTerminator());
3473       StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
3474     }
3475   } else {
3476     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3477                                        RdxDesc.getFastMathFlags());
3478 
3479     if (!ScalarPHI) {
3480       if (CurrentPart == 0) {
3481         // Create start and identity vector values for the reduction in the
3482         // preheader.
3483         // TODO: Introduce recipes in VPlan preheader to create initial values.
3484         Iden = Builder.CreateVectorSplat(VF, Iden);
3485         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3486         Builder.SetInsertPoint(VectorPH->getTerminator());
3487         Constant *Zero = Builder.getInt32(0);
3488         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
3489       } else {
3490         Iden = Builder.CreateVectorSplat(VF, Iden);
3491       }
3492     }
3493   }
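  // For example, for a widened integer add reduction with start value %s and a
  // fixed VF of 4, part 0 receives <%s, 0, 0, 0> and all other parts receive
  // <0, 0, 0, 0>.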
3494 
3495   Phi = cast<PHINode>(State.get(this, IsInLoop));
3496   Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3497   Phi->addIncoming(StartVal, VectorPH);
3498 }
3499 
3500 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3501 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3502                                  VPSlotTracker &SlotTracker) const {
3503   O << Indent << "WIDEN-REDUCTION-PHI ";
3504 
3505   printAsOperand(O, SlotTracker);
3506   O << " = phi ";
3507   printOperands(O, SlotTracker);
3508   if (VFScaleFactor != 1)
3509     O << " (VF scaled by 1/" << VFScaleFactor << ")";
3510 }
3511 #endif
3512 
3513 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3514   assert(EnableVPlanNativePath &&
3515          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3516 
3517   State.setDebugLocFrom(getDebugLoc());
3518   Value *Op0 = State.get(getOperand(0));
3519   Type *VecTy = Op0->getType();
3520   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3521   State.set(this, VecPhi);
3522 }
3523 
3524 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3525 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3526                              VPSlotTracker &SlotTracker) const {
3527   O << Indent << "WIDEN-PHI ";
3528 
3529   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3530   // Unless all incoming values are modeled in VPlan, print the original PHI
3531   // directly.
3532   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3533   // values as VPValues.
3534   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3535     O << VPlanIngredient(OriginalPhi);
3536     return;
3537   }
3538 
3539   printAsOperand(O, SlotTracker);
3540   O << " = phi ";
3541   printOperands(O, SlotTracker);
3542 }
3543 #endif
3544 
3545 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3546 // remove VPActiveLaneMaskPHIRecipe.
3547 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3548   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3549   Value *StartMask = State.get(getOperand(0));
3550   PHINode *Phi =
3551       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3552   Phi->addIncoming(StartMask, VectorPH);
3553   Phi->setDebugLoc(getDebugLoc());
3554   State.set(this, Phi);
3555 }
3556 
3557 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3558 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3559                                       VPSlotTracker &SlotTracker) const {
3560   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3561 
3562   printAsOperand(O, SlotTracker);
3563   O << " = phi ";
3564   printOperands(O, SlotTracker);
3565 }
3566 #endif
3567 
3568 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3569 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3570                                   VPSlotTracker &SlotTracker) const {
3571   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3572 
3573   printAsOperand(O, SlotTracker);
3574   O << " = phi ";
3575   printOperands(O, SlotTracker);
3576 }
3577 #endif
3578 
3579 void VPScalarPHIRecipe::execute(VPTransformState &State) {
3580   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3581   Value *Start = State.get(getStartValue(), VPLane(0));
3582   PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
3583   Phi->addIncoming(Start, VectorPH);
3584   Phi->setDebugLoc(getDebugLoc());
3585   State.set(this, Phi, /*IsScalar=*/true);
3586 }
3587 
3588 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3589 void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3590                               VPSlotTracker &SlotTracker) const {
3591   O << Indent << "SCALAR-PHI ";
3592   printAsOperand(O, SlotTracker);
3593   O << " = phi ";
3594   printOperands(O, SlotTracker);
3595 }
3596 #endif
3597