xref: /llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (revision 632317e9ab5548e991d8974954353033bea62a5b)
1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/Analysis/IVDescriptors.h"
20 #include "llvm/IR/BasicBlock.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/Type.h"
25 #include "llvm/IR/Value.h"
26 #include "llvm/Support/Casting.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/Debug.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
31 #include "llvm/Transforms/Utils/LoopUtils.h"
32 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
33 #include <cassert>
34 
35 using namespace llvm;
36 
37 using VectorParts = SmallVector<Value *, 2>;
38 
39 namespace llvm {
40 extern cl::opt<bool> EnableVPlanNativePath;
41 }
42 
43 #define LV_NAME "loop-vectorize"
44 #define DEBUG_TYPE LV_NAME
45 
// Returns true if this recipe may write to memory. Classification is
// conservative: any recipe kind not explicitly listed falls to the default
// and is assumed to write.
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    // An interleave group writes iff it contains at least one store member.
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes clone a scalar instruction per lane; defer to the
    // underlying IR instruction's memory behavior.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    // A call may write unless the scalar callee is known to only read memory.
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These recipe kinds never write. In asserts builds, double-check that
    // the underlying IR instruction (if any) agrees with this classification.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    // Unknown recipe kind: conservatively assume it may write.
    return true;
  }
}
86 
// Returns true if this recipe may read from memory. Mirrors
// mayWriteToMemory(): unlisted recipe kinds are conservatively assumed to
// read.
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    // Defer to the scalar instruction that gets replicated per lane.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    // A call may read unless the scalar callee is known to only write memory.
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    // Stores write but never read.
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These recipe kinds never read. In asserts builds, double-check that the
    // underlying IR instruction (if any) agrees.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // Unknown recipe kind: conservatively assume it may read.
    return true;
  }
}
125 
// Returns true if this recipe may have side effects (including throwing or
// not returning), i.e. whether it must be kept even if its result is unused.
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    // Only the explicitly listed VPInstruction opcodes are known side-effect
    // free; everything else is conservatively treated as having effects.
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    // Calls have side effects if they write memory, may throw, or may not
    // return.
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These recipe kinds are side-effect free. In asserts builds, verify the
    // underlying IR instruction (if any) agrees.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    // Side effects iff the group contains stores.
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    // For memory recipes, side effects reduce to "does it write"; assert the
    // original IR instruction reaches the same conclusion.
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    // Defer to the scalar instruction being replicated.
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    // Unknown recipe kind: conservatively assume side effects.
    return true;
  }
}
188 
// Hook up the IR phi for this live-out: add an incoming value from the middle
// block, computed from the vectorized exit value.
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  // By default the live-out is the last lane of the last unrolled part; a
  // value that is uniform after vectorization is identical in all lanes, so
  // lane 0 suffices (and avoids an extract).
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  // Incoming value is taken from the final unrolled part (UF - 1).
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}
202 
203 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print a textual form of this live-out, e.g. "Live-out %phi = %value".
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
211 #endif
212 
213 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
214   assert(!Parent && "Recipe already in some VPBasicBlock");
215   assert(InsertPos->getParent() &&
216          "Insertion position not in any VPBasicBlock");
217   InsertPos->getParent()->insert(this, InsertPos->getIterator());
218 }
219 
// Insert this (currently unlinked) recipe into block BB at position I.
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  // BB.end() is a valid insertion point; any other iterator must point at a
  // recipe already inside BB.
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}
226 
227 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
228   assert(!Parent && "Recipe already in some VPBasicBlock");
229   assert(InsertPos->getParent() &&
230          "Insertion position not in any VPBasicBlock");
231   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
232 }
233 
234 void VPRecipeBase::removeFromParent() {
235   assert(getParent() && "Recipe not in any VPBasicBlock");
236   getParent()->getRecipeList().remove(getIterator());
237   Parent = nullptr;
238 }
239 
240 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
241   assert(getParent() && "Recipe not in any VPBasicBlock");
242   return getParent()->getRecipeList().erase(getIterator());
243 }
244 
// Move this recipe from its current block to directly after InsertPos
// (possibly in a different block).
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
249 
// Move this recipe from its current block to position I of block BB.
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
255 
256 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
257   assert(OpType == OperationType::FPMathOp &&
258          "recipe doesn't have fast math flags");
259   FastMathFlags Res;
260   Res.setAllowReassoc(FMFs.AllowReassoc);
261   Res.setNoNaNs(FMFs.NoNaNs);
262   Res.setNoInfs(FMFs.NoInfs);
263   Res.setNoSignedZeros(FMFs.NoSignedZeros);
264   Res.setAllowReciprocal(FMFs.AllowReciprocal);
265   Res.setAllowContract(FMFs.AllowContract);
266   Res.setApproxFunc(FMFs.ApproxFunc);
267   return Res;
268 }
269 
// Construct a two-operand VPInstruction that carries a compare predicate.
// Only integer compares (ICmp) currently use this form.
VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}
279 
// Construct a VPInstruction that carries fast-math flags; only valid for
// opcodes classified as FP math (see isFPMathOp).
VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}
288 
289 bool VPInstruction::doesGeneratePerAllLanes() const {
290   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
291 }
292 
293 bool VPInstruction::canGenerateScalarForFirstLane() const {
294   if (Instruction::isBinaryOp(getOpcode()))
295     return true;
296 
297   switch (Opcode) {
298   case VPInstruction::BranchOnCond:
299   case VPInstruction::BranchOnCount:
300   case VPInstruction::CalculateTripCountMinusVF:
301   case VPInstruction::CanonicalIVIncrementForPart:
302   case VPInstruction::ComputeReductionResult:
303   case VPInstruction::PtrAdd:
304   case VPInstruction::ExplicitVectorLength:
305     return true;
306   default:
307     return false;
308   }
309 }
310 
311 Value *VPInstruction::generatePerLane(VPTransformState &State,
312                                       const VPIteration &Lane) {
313   IRBuilderBase &Builder = State.Builder;
314 
315   assert(getOpcode() == VPInstruction::PtrAdd &&
316          "only PtrAdd opcodes are supported for now");
317   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
318                               State.get(getOperand(1), Lane), Name);
319 }
320 
// Emit the IR for unroll part \p Part of this VPInstruction and return the
// generated value (or nullptr for branch opcodes on non-zero parts).
Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  // Fast path: plain IR binary operators are widened directly.
  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    // If only the first part is demanded, reuse part 0's value.
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0, OnlyFirstLaneUsed);

    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    // Transfer wrap/exact/FMF flags carried by this recipe onto the new IR.
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    // Scalar (VF=1) recurrences need no splice: the previous value is used
    // directly.
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    // The result is part-invariant; only compute it once.
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // Computes max(TC - VF * UF, 0), i.e. the trip count of all but the last
    // (possibly partial) vector iteration.
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    // Part 0 is the unincremented IV itself.
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    // Only one terminator per block: emit nothing for later parts.
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    // The true successor was only a placeholder (CreateCondBr requires a
    // non-null block); it is filled in when the forward target is created.
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // Only one terminator per block: emit nothing for later parts.
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    // The final reduced value is part-invariant; only compute it once.
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    // Collect the per-part exit values (note: loop variable shadows the
    // function's Part parameter, which is known to be 0 here).
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      // Ordered (strict FP) reductions already chain across parts; the last
      // part holds the full result.
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    // Per-part PtrAdd is only valid when just the first lane is demanded;
    // otherwise generatePerLane is used instead.
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
577 
578 #if !defined(NDEBUG)
579 bool VPInstruction::isFPMathOp() const {
580   // Inspired by FPMathOperator::classof. Notable differences are that we don't
581   // support Call, PHI and Select opcodes here yet.
582   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
583          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
584          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
585          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
586 }
587 #endif
588 
// Generate the IR for this VPInstruction across all unroll parts (and lanes,
// where required), recording results in State.
void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  // Scope fast-math flags to just this recipe's IR.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  // ComputeReductionResult always produces a scalar even when not all users
  // demand only the first lane.
  bool GeneratesPerFirstLaneOnly =
      canGenerateScalarForFirstLane() &&
      (vputils::onlyFirstLaneUsed(this) ||
       getOpcode() == VPInstruction::ComputeReductionResult);
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      // Emit one scalar per lane of this part.
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    // Branch opcodes have no result to record.
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}
626 
627 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
628   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
629   if (Instruction::isBinaryOp(getOpcode()))
630     return vputils::onlyFirstLaneUsed(this);
631 
632   switch (getOpcode()) {
633   default:
634     return false;
635   case Instruction::ICmp:
636   case VPInstruction::PtrAdd:
637     // TODO: Cover additional opcodes.
638     return vputils::onlyFirstLaneUsed(this);
639   case VPInstruction::ActiveLaneMask:
640   case VPInstruction::ExplicitVectorLength:
641   case VPInstruction::CalculateTripCountMinusVF:
642   case VPInstruction::CanonicalIVIncrementForPart:
643   case VPInstruction::BranchOnCount:
644     return true;
645   };
646   llvm_unreachable("switch should return");
647 }
648 
649 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
650 void VPInstruction::dump() const {
651   VPSlotTracker SlotTracker(getParent()->getPlan());
652   print(dbgs(), "", SlotTracker);
653 }
654 
// Print a textual form of this VPInstruction: "EMIT [<result> =] <opcode>
// <flags> <operands> [, !dbg <loc>]".
void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  // Branch-like opcodes have no result to name.
  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  // VPlan-specific opcodes get a mnemonic; plain IR opcodes fall through to
  // their standard name in the default case.
  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
716 #endif
717 
718 void VPWidenCallRecipe::execute(VPTransformState &State) {
719   assert(State.VF.isVector() && "not widening");
720   Function *CalledScalarFn = getCalledScalarFunction();
721   assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
722          "DbgInfoIntrinsic should have been dropped during VPlan construction");
723   State.setDebugLocFrom(getDebugLoc());
724 
725   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
726   FunctionType *VFTy = nullptr;
727   if (Variant)
728     VFTy = Variant->getFunctionType();
729   for (unsigned Part = 0; Part < State.UF; ++Part) {
730     SmallVector<Type *, 2> TysForDecl;
731     // Add return type if intrinsic is overloaded on it.
732     if (UseIntrinsic &&
733         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
734       TysForDecl.push_back(VectorType::get(
735           CalledScalarFn->getReturnType()->getScalarType(), State.VF));
736     SmallVector<Value *, 4> Args;
737     for (const auto &I : enumerate(arg_operands())) {
738       // Some intrinsics have a scalar argument - don't replace it with a
739       // vector.
740       Value *Arg;
741       if (UseIntrinsic &&
742           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
743         Arg = State.get(I.value(), VPIteration(0, 0));
744       // Some vectorized function variants may also take a scalar argument,
745       // e.g. linear parameters for pointers. This needs to be the scalar value
746       // from the start of the respective part when interleaving.
747       else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
748         Arg = State.get(I.value(), VPIteration(Part, 0));
749       else
750         Arg = State.get(I.value(), Part);
751       if (UseIntrinsic &&
752           isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
753         TysForDecl.push_back(Arg->getType());
754       Args.push_back(Arg);
755     }
756 
757     Function *VectorF;
758     if (UseIntrinsic) {
759       // Use vector version of the intrinsic.
760       Module *M = State.Builder.GetInsertBlock()->getModule();
761       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
762       assert(VectorF && "Can't retrieve vector intrinsic.");
763     } else {
764 #ifndef NDEBUG
765       assert(Variant != nullptr && "Can't create vector function.");
766 #endif
767       VectorF = Variant;
768     }
769 
770     auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
771     SmallVector<OperandBundleDef, 1> OpBundles;
772     if (CI)
773       CI->getOperandBundlesAsDefs(OpBundles);
774 
775     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
776 
777     if (isa<FPMathOperator>(V))
778       V->copyFastMathFlags(CI);
779 
780     if (!V->getType()->isVoidTy())
781       State.set(this, V, Part);
782     State.addMetadata(V, CI);
783   }
784 }
785 
786 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print a textual form of this widened call, noting whether a vector
// intrinsic or a library variant will be used.
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  // Non-zero ID means a vector intrinsic is used (not_intrinsic is 0);
  // otherwise a library-function Variant must have been chosen.
  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}
814 
815 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
816                                 VPSlotTracker &SlotTracker) const {
817   O << Indent << "WIDEN-SELECT ";
818   printAsOperand(O, SlotTracker);
819   O << " = select ";
820   getOperand(0)->printAsOperand(O, SlotTracker);
821   O << ", ";
822   getOperand(1)->printAsOperand(O, SlotTracker);
823   O << ", ";
824   getOperand(2)->printAsOperand(O, SlotTracker);
825   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
826 }
827 #endif
828 
829 void VPWidenSelectRecipe::execute(VPTransformState &State) {
830   State.setDebugLocFrom(getDebugLoc());
831 
832   // The condition can be loop invariant but still defined inside the
833   // loop. This means that we can't just use the original 'cond' value.
834   // We have to take the 'vectorized' value and pick the first lane.
835   // Instcombine will make this a no-op.
836   auto *InvarCond =
837       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
838 
839   for (unsigned Part = 0; Part < State.UF; ++Part) {
840     Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
841     Value *Op0 = State.get(getOperand(1), Part);
842     Value *Op1 = State.get(getOperand(2), Part);
843     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
844     State.set(this, Sel, Part);
845     State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
846   }
847 }
848 
// Copy each individual fast-math flag from the IR-level FastMathFlags into
// the compact bitfield representation carried by VPRecipeWithIRFlags.
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}
859 
860 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the IR flags carried by this recipe (predicate, wrapping, exact,
// fast-math, inbounds, nneg, ...), dispatching on the operation kind the
// flags belong to.
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    // For compares the "flag" is the predicate itself, e.g. "eq" or "ult".
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    // FastMathFlags::print emits the full set of FMF keywords.
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  // Separate the flags from the operands printed after them.
  if (getNumOperands() > 0)
    O << " ";
}
897 #endif
898 
// Widen a simple arithmetic/logic/compare/freeze instruction: emit one vector
// instruction per unroll part, operating on the widened operands.
void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      // Transfer the recipe's IR flags (nuw/nsw, exact, FMF, ...) onto the
      // widened instruction. The builder may constant-fold to a
      // non-Instruction value, in which case there is nothing to flag.
      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    // Freeze is widened by freezing the widened operand, per part.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags. The guard restores the builder's FMF
        // when this scope exits.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}
995 
996 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
997 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
998                           VPSlotTracker &SlotTracker) const {
999   O << Indent << "WIDEN ";
1000   printAsOperand(O, SlotTracker);
1001   O << " = " << Instruction::getOpcodeName(Opcode);
1002   printFlags(O);
1003   printOperands(O, SlotTracker);
1004 }
1005 #endif
1006 
// Widen a cast: emit one vector cast per unroll part, converting the widened
// operand to a vector of this recipe's scalar result type.
void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part > 0 && Op->isLiveIn() ? false : true; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}
1026 
1027 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1028 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1029                               VPSlotTracker &SlotTracker) const {
1030   O << Indent << "WIDEN-CAST ";
1031   printAsOperand(O, SlotTracker);
1032   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1033   printFlags(O);
1034   printOperands(O, SlotTracker);
1035   O << " to " << *getResultType();
1036 }
1037 #endif
1038 
1039 /// This function adds
1040 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
1041 /// to each vector element of Val. The sequence starts at StartIndex.
1042 /// \p Opcode is relevant for FP induction variable.
1043 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1044                             Instruction::BinaryOps BinOp, ElementCount VF,
1045                             IRBuilderBase &Builder) {
1046   assert(VF.isVector() && "only vector VFs are supported");
1047 
1048   // Create and check the types.
1049   auto *ValVTy = cast<VectorType>(Val->getType());
1050   ElementCount VLen = ValVTy->getElementCount();
1051 
1052   Type *STy = Val->getType()->getScalarType();
1053   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1054          "Induction Step must be an integer or FP");
1055   assert(Step->getType() == STy && "Step has wrong type");
1056 
1057   SmallVector<Constant *, 8> Indices;
1058 
1059   // Create a vector of consecutive numbers from zero to VF.
1060   VectorType *InitVecValVTy = ValVTy;
1061   if (STy->isFloatingPointTy()) {
1062     Type *InitVecValSTy =
1063         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1064     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1065   }
1066   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1067 
1068   // Splat the StartIdx
1069   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1070 
1071   if (STy->isIntegerTy()) {
1072     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1073     Step = Builder.CreateVectorSplat(VLen, Step);
1074     assert(Step->getType() == Val->getType() && "Invalid step vec");
1075     // FIXME: The newly created binary instructions should contain nsw/nuw
1076     // flags, which can be found from the original scalar operations.
1077     Step = Builder.CreateMul(InitVec, Step);
1078     return Builder.CreateAdd(Val, Step, "induction");
1079   }
1080 
1081   // Floating point induction.
1082   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1083          "Binary Opcode should be specified for FP induction");
1084   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1085   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1086 
1087   Step = Builder.CreateVectorSplat(VLen, Step);
1088   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1089   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1090 }
1091 
1092 /// A helper function that returns an integer or floating-point constant with
1093 /// value C.
1094 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1095   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1096                            : ConstantFP::get(Ty, C);
1097 }
1098 
1099 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1100                                   ElementCount VF) {
1101   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1102   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1103   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1104   return B.CreateUIToFP(RuntimeVF, FTy);
1105 }
1106 
// Create the widened (vector) induction variable: a vector phi in the vector
// loop header seeded with a stepped start value built in the preheader, plus
// one step-add per unroll part.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  // (the insertion point is restored afterwards).
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // The original IV was used through a truncate; narrow both start and
    // step to the truncated type.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  // SteppedStart = splat(Start) + <0, 1, ..., VF-1> * Step.
  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}
1207 
1208 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    // The "\\l" / quote sequences appear to be DOT-label escapes for when
    // this output is embedded in a VPlan graph node — confirm before
    // changing the format.
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
1223 #endif
1224 
1225 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1226   // The step may be defined by a recipe in the preheader (e.g. if it requires
1227   // SCEV expansion), but for the canonical induction the step is required to be
1228   // 1, which is represented as live-in.
1229   if (getStepValue()->getDefiningRecipe())
1230     return false;
1231   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1232   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1233   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1234   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1235          getScalarType() == CanIV->getScalarType();
1236 }
1237 
1238 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1239 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1240                               VPSlotTracker &SlotTracker) const {
1241   O << Indent;
1242   printAsOperand(O, SlotTracker);
1243   O << Indent << "= DERIVED-IV ";
1244   getStartValue()->printAsOperand(O, SlotTracker);
1245   O << " + ";
1246   getOperand(1)->printAsOperand(O, SlotTracker);
1247   O << " * ";
1248   getStepValue()->printAsOperand(O, SlotTracker);
1249 }
1250 #endif
1251 
// Materialize per-lane scalar induction values BaseIV + (Part*VF + Lane) *
// Step, and additionally a whole step vector per part for scalable VFs.
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  // For scalable VFs (when more than the first lane is used) also prepare the
  // pieces for a full vector of steps, since the lane count is unknown at
  // compile time.
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  // When replicating a single instance, restrict generation to the requested
  // part and lane only.
  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    // StartIdx0 = Part * VF (runtime value for scalable VF).
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      // StartIdx = StartIdx0 + Lane; lane value = BaseIV + StartIdx * Step.
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}
1340 
1341 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1342 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1343                                   VPSlotTracker &SlotTracker) const {
1344   O << Indent;
1345   printAsOperand(O, SlotTracker);
1346   O << " = SCALAR-STEPS ";
1347   printOperands(O, SlotTracker);
1348 }
1349 #endif
1350 
// Widen a getelementptr into a vector-of-pointers GEP, keeping loop-invariant
// operands scalar (lane 0) to keep the representation compact.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    // The same scalar GEP is splat for every part.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
1422 
1423 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1424 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1425                              VPSlotTracker &SlotTracker) const {
1426   O << Indent << "WIDEN-GEP ";
1427   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1428   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1429     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1430 
1431   O << " ";
1432   printAsOperand(O, SlotTracker);
1433   O << " = getelementptr";
1434   printFlags(O);
1435   printOperands(O, SlotTracker);
1436 }
1437 #endif
1438 
1439 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
1440   auto &Builder = State.Builder;
1441   State.setDebugLocFrom(getDebugLoc());
1442   for (unsigned Part = 0; Part < State.UF; ++Part) {
1443     // Calculate the pointer for the specific unroll-part.
1444     Value *PartPtr = nullptr;
1445     // Use i32 for the gep index type when the value is constant,
1446     // or query DataLayout for a more suitable index type otherwise.
1447     const DataLayout &DL =
1448         Builder.GetInsertBlock()->getModule()->getDataLayout();
1449     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1450                         ? DL.getIndexType(IndexedTy->getPointerTo())
1451                         : Builder.getInt32Ty();
1452     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1453     bool InBounds = isInBounds();
1454     if (IsReverse) {
1455       // If the address is consecutive but reversed, then the
1456       // wide store needs to start at the last vector element.
1457       // RunTimeVF =  VScale * VF.getKnownMinValue()
1458       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
1459       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1460       // NumElt = -Part * RunTimeVF
1461       Value *NumElt = Builder.CreateMul(
1462           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1463       // LastLane = 1 - RunTimeVF
1464       Value *LastLane =
1465           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1466       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1467       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1468     } else {
1469       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1470       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1471     }
1472 
1473     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1474   }
1475 }
1476 
1477 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1478 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1479                                   VPSlotTracker &SlotTracker) const {
1480   O << Indent;
1481   printAsOperand(O, SlotTracker);
1482   O << " = vector-pointer ";
1483   if (IsReverse)
1484     O << "(reverse) ";
1485 
1486   printOperands(O, SlotTracker);
1487 }
1488 #endif
1489 
1490 void VPBlendRecipe::execute(VPTransformState &State) {
1491   State.setDebugLocFrom(getDebugLoc());
1492   // We know that all PHIs in non-header blocks are converted into
1493   // selects, so we don't have to worry about the insertion order and we
1494   // can just use the builder.
1495   // At this point we generate the predication tree. There may be
1496   // duplications since this is a simple recursive scan, but future
1497   // optimizations will clean it up.
1498 
1499   unsigned NumIncoming = getNumIncomingValues();
1500 
1501   // Generate a sequence of selects of the form:
1502   // SELECT(Mask3, In3,
1503   //        SELECT(Mask2, In2,
1504   //               SELECT(Mask1, In1,
1505   //                      In0)))
1506   // Note that Mask0 is never used: lanes for which no path reaches this phi and
1507   // are essentially undef are taken from In0.
1508  VectorParts Entry(State.UF);
1509   for (unsigned In = 0; In < NumIncoming; ++In) {
1510     for (unsigned Part = 0; Part < State.UF; ++Part) {
1511       // We might have single edge PHIs (blocks) - use an identity
1512       // 'select' for the first PHI operand.
1513       Value *In0 = State.get(getIncomingValue(In), Part);
1514       if (In == 0)
1515         Entry[Part] = In0; // Initialize with the first incoming value.
1516       else {
1517         // Select between the current value and the previous incoming edge
1518         // based on the incoming mask.
1519         Value *Cond = State.get(getMask(In), Part);
1520         Entry[Part] =
1521             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1522       }
1523     }
1524   }
1525   for (unsigned Part = 0; Part < State.UF; ++Part)
1526     State.set(this, Entry[Part], Part);
1527 }
1528 
1529 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1530 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1531                           VPSlotTracker &SlotTracker) const {
1532   O << Indent << "BLEND ";
1533   printAsOperand(O, SlotTracker);
1534   O << " =";
1535   if (getNumIncomingValues() == 1) {
1536     // Not a User of any mask: not really blending, this is a
1537     // single-predecessor phi.
1538     O << " ";
1539     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1540   } else {
1541     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1542       O << " ";
1543       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1544       if (I == 0)
1545         continue;
1546       O << "/";
1547       getMask(I)->printAsOperand(O, SlotTracker);
1548     }
1549   }
1550 }
1551 #endif
1552 
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      // Predicated reduction: select the identity element into lanes where the
      // condition is false, so those lanes do not affect the result.
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      // Ordered (strict) reductions fold the vector operand into the running
      // chain value sequentially, carrying PrevInChain across parts.
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      // Unordered reductions reduce each part independently and combine the
      // result with that part's chain value below.
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}
1601 
1602 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1603 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1604                               VPSlotTracker &SlotTracker) const {
1605   O << Indent << "REDUCE ";
1606   printAsOperand(O, SlotTracker);
1607   O << " = ";
1608   getChainOp()->printAsOperand(O, SlotTracker);
1609   O << " +";
1610   if (isa<FPMathOperator>(getUnderlyingInstr()))
1611     O << getUnderlyingInstr()->getFastMathFlags();
1612   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1613   getVecOp()->printAsOperand(O, SlotTracker);
1614   if (getCondOp()) {
1615     O << ", ";
1616     getCondOp()->printAsOperand(O, SlotTracker);
1617   }
1618   O << ")";
1619   if (RdxDesc.IntermediateStore)
1620     O << " (with final reduction value stored in invariant address sank "
1621          "outside of loop)";
1622 }
1623 #endif
1624 
1625 bool VPReplicateRecipe::shouldPack() const {
1626   // Find if the recipe is used by a widened recipe via an intervening
1627   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1628   return any_of(users(), [](const VPUser *U) {
1629     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1630       return any_of(PredR->users(), [PredR](const VPUser *U) {
1631         return !U->usesScalars(PredR);
1632       });
1633     return false;
1634   });
1635 }
1636 
1637 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1638 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1639                               VPSlotTracker &SlotTracker) const {
1640   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1641 
1642   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1643     printAsOperand(O, SlotTracker);
1644     O << " = ";
1645   }
1646   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1647     O << "call";
1648     printFlags(O);
1649     O << "@" << CB->getCalledFunction()->getName() << "(";
1650     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1651                     O, [&O, &SlotTracker](VPValue *Op) {
1652                       Op->printAsOperand(O, SlotTracker);
1653                     });
1654     O << ")";
1655   } else {
1656     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1657     printFlags(O);
1658     printOperands(O, SlotTracker);
1659   }
1660 
1661   if (shouldPack())
1662     O << " (S->V)";
1663 }
1664 #endif
1665 
1666 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
1667 /// if it is either defined outside the vector region or its operand is known to
1668 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
1669 /// TODO: Uniformity should be associated with a VPValue and there should be a
1670 /// generic way to check.
1671 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
1672   return C->isDefinedOutsideVectorRegions() ||
1673          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
1674          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
1675 }
1676 
1677 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
1678   assert(vputils::onlyFirstLaneUsed(this) &&
1679          "Codegen only implemented for first lane.");
1680   switch (Opcode) {
1681   case Instruction::SExt:
1682   case Instruction::ZExt:
1683   case Instruction::Trunc: {
1684     // Note: SExt/ZExt not used yet.
1685     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
1686     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
1687   }
1688   default:
1689     llvm_unreachable("opcode not implemented yet");
1690   }
1691 }
1692 
1693 void VPScalarCastRecipe ::execute(VPTransformState &State) {
1694   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
1695   for (unsigned Part = 0; Part != State.UF; ++Part) {
1696     Value *Res;
1697     // Only generate a single instance, if the recipe is uniform across UFs and
1698     // VFs.
1699     if (Part > 0 && IsUniformAcrossVFsAndUFs)
1700       Res = State.get(this, VPIteration(0, 0));
1701     else
1702       Res = generate(State, Part);
1703     State.set(this, Res, VPIteration(Part, 0));
1704   }
1705 }
1706 
1707 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1708 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
1709                                 VPSlotTracker &SlotTracker) const {
1710   O << Indent << "SCALAR-CAST ";
1711   printAsOperand(O, SlotTracker);
1712   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1713   printOperands(O, SlotTracker);
1714   O << " to " << *ResultTy;
1715 }
1716 #endif
1717 
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    // The mask may be a vector of booleans; extract the bit for the current
    // lane.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  // Clear the placeholder successor; both destinations are filled in later.
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
1743 
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    // Vector case: phi between the unmodified vector (coming from the
    // predicating block) and the vector with the freshly inserted element
    // (coming from the predicated block).
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    // Scalar case: phi between poison (the lane where the predicated
    // instruction was skipped) and the generated scalar value.
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
1789 
1790 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  // Print as: "PHI-PREDICATED-INSTRUCTION <result> = <operands>".
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
1798 
void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  // Print as: "WIDEN <result> = load <operands>".
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
1806 
void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  // Print as: "WIDEN <result> = vp.load <operands>".
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
1814 
void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  // Print as: "WIDEN store <operands>" (stores define no result).
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
1820 
void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  // Print as: "WIDEN vp.store <operands>" (stores define no result).
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
1826 #endif
1827 
1828 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1829   Value *Start = getStartValue()->getLiveInIRValue();
1830   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1831   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1832 
1833   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1834   EntryPart->addIncoming(Start, VectorPH);
1835   EntryPart->setDebugLoc(getDebugLoc());
1836   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1837     State.set(this, EntryPart, Part, /*IsScalar*/ true);
1838 }
1839 
1840 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  // Print as: "EMIT <result> = CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
1848 #endif
1849 
1850 bool VPCanonicalIVPHIRecipe::isCanonical(
1851     InductionDescriptor::InductionKind Kind, VPValue *Start,
1852     VPValue *Step) const {
1853   // Must be an integer induction.
1854   if (Kind != InductionDescriptor::IK_IntInduction)
1855     return false;
1856   // Start must match the start value of this canonical induction.
1857   if (Start != getStartValue())
1858     return false;
1859 
1860   // If the step is defined by a recipe, it is not a ConstantInt.
1861   if (Step->getDefiningRecipe())
1862     return false;
1863 
1864   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1865   return StepC && StepC->isOne();
1866 }
1867 
1868 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
1869   return IsScalarAfterVectorization &&
1870          (!IsScalable || vputils::onlyFirstLaneUsed(this));
1871 }
1872 
1873 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  // Print as: "EMIT <result> = WIDEN-POINTER-INDUCTION <start>, <step>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
1882 #endif
1883 
1884 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1885   assert(!State.Instance && "cannot be used in per-lane");
1886   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1887   SCEVExpander Exp(SE, DL, "induction");
1888 
1889   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1890                                  &*State.Builder.GetInsertPoint());
1891   assert(!State.ExpandedSCEVs.contains(Expr) &&
1892          "Same SCEV expanded multiple times");
1893   State.ExpandedSCEVs[Expr] = Res;
1894   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1895     State.set(this, Res, {Part, 0});
1896 }
1897 
1898 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  // Print as: "EMIT <result> = EXPAND SCEV <expr>".
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
1905 #endif
1906 
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  // Widen the scalar canonical IV: splat it and add a per-part offset plus a
  // per-lane step vector, yielding <iv+Part*VF+0, iv+Part*VF+1, ...>.
  Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    // Per-part scalar offset Part * VF (handles scalable VFs).
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      // Splat the offset and add <0, 1, 2, ...> to number the lanes.
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}
1926 
1927 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Print as: "EMIT <result> = WIDEN-CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
1935 #endif
1936 
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    // Insert the start value into the last lane (RuntimeVF - 1, computed at
    // runtime to support scalable VFs) of a poison vector; this is done in the
    // preheader.
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}
1964 
1965 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  // Print as: "FIRST-ORDER-RECURRENCE-PHI <result> = phi <operands>".
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
1973 #endif
1974 
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  // Ordered reductions use a single phi; otherwise one phi per unroll part.
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Compute the identity element for the recurrence; it feeds the phis of all
  // parts except the first, which receives the start value (see below).
  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    // Other reductions use the recurrence's identity element; for vector phis
    // the start value is inserted into lane 0 of the identity splat in the
    // preheader.
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
2037 
2038 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  // Print as: "WIDEN-REDUCTION-PHI <result> = phi <operands>".
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
2047 #endif
2048 
2049 void VPWidenPHIRecipe::execute(VPTransformState &State) {
2050   assert(EnableVPlanNativePath &&
2051          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
2052 
2053   Value *Op0 = State.get(getOperand(0), 0);
2054   Type *VecTy = Op0->getType();
2055   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
2056   State.set(this, VecPhi, 0);
2057 }
2058 
2059 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  // Print as: "WIDEN-PHI <result> = phi <operands>", falling back to the
  // original IR phi when not all incoming values are modeled in VPlan.
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan  print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
2079 
2080 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2081 // remove VPActiveLaneMaskPHIRecipe.
2082 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
2083   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2084   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2085     Value *StartMask = State.get(getOperand(0), Part);
2086     PHINode *EntryPart =
2087         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
2088     EntryPart->addIncoming(StartMask, VectorPH);
2089     EntryPart->setDebugLoc(getDebugLoc());
2090     State.set(this, EntryPart, Part);
2091   }
2092 }
2093 
2094 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  // Print as: "ACTIVE-LANE-MASK-PHI <result> = phi <operands>".
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
2103 #endif
2104 
2105 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
2106   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2107   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
2108   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
2109   PHINode *EntryPart =
2110       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
2111   EntryPart->addIncoming(Start, VectorPH);
2112   EntryPart->setDebugLoc(getDebugLoc());
2113   State.set(this, EntryPart, 0, /*IsScalar=*/true);
2114 }
2115 
2116 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  // Print as: "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI <result> = phi <operands>".
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
2125 #endif
2126