//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

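// Conservatively classify whether a recipe may write to memory: known
// memory-writing recipes return true, recipes known not to write return
// false, and unknown recipe kinds default to true.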
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

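// A recipe has side effects if it may write to memory, may trap, or may not
// return; such recipes must not be dropped or reordered freely.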
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

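// Rewire the scalar-loop exit phi to take the value computed by the vector
// loop: the last lane of the last unrolled part, or lane 0 if the exit value
// is uniform, incoming from the middle block.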
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isVectorToScalar())
    return true;
  switch (Opcode) {
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0, OnlyFirstLaneUsed);

    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

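    // Otherwise emit @llvm.get.active.lane.mask, whose lane i is
    // (VIVElem0 + i) ult ScalarTC, yielding a <VF x i1> predicate.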
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

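    // Emit TC > VF * UF ? TC - VF * UF : 0, i.e. the trip count reduced by
    // one full vector iteration across all unrolled parts, clamped at zero.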
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
408     auto GetEVL = [=](VPTransformState &State, Value *AVL) {
409       assert(AVL->getType()->isIntegerTy() &&
410              "Requested vector length should be an integer.");
411 
412       // TODO: Add support for MaxSafeDist for correct loop emission.
413       assert(State.VF.isScalable() && "Expected scalable vector factor.");
414       Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
415 
416       Value *EVL = State.Builder.CreateIntrinsic(
417           State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
418           {AVL, VFArg, State.Builder.getTrue()});
419       return EVL;
420     };
421     // TODO: Restructure this code with an explicit remainder loop, vsetvli can
422     // be outside of the main loop.
423     assert(Part == 0 && "No unrolling expected for predicated vectorization.");
424     // Compute VTC - IV as the AVL (requested vector length).
425     Value *Index = State.get(getOperand(0), VPIteration(0, 0));
426     Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
427     Value *AVL = State.Builder.CreateSub(TripCount, Index);
428     Value *EVL = GetEVL(State, AVL);
429     return EVL;
430   }
431   case VPInstruction::CanonicalIVIncrementForPart: {
432     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
433     if (Part == 0)
434       return IV;
435 
436     // The canonical IV is incremented by the vectorization factor (num of SIMD
437     // elements) times the unroll part.
438     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
439     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
440                              hasNoSignedWrap());
441   }
442   case VPInstruction::BranchOnCond: {
443     if (Part != 0)
444       return nullptr;
445 
446     Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
447     VPRegionBlock *ParentRegion = getParent()->getParent();
448     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
449 
450     // Replace the temporary unreachable terminator with a new conditional
451     // branch, hooking it up to backward destination for exiting blocks now and
452     // to forward destination(s) later when they are created.
453     BranchInst *CondBr =
454         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
455 
456     if (getParent()->isExiting())
457       CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
458 
459     CondBr->setSuccessor(0, nullptr);
460     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
461     return CondBr;
462   }
463   case VPInstruction::BranchOnCount: {
464     if (Part != 0)
465       return nullptr;
466     // First create the compare.
467     Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
468     Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
469     Value *Cond = Builder.CreateICmpEQ(IV, TC);
470 
471     // Now create the branch.
472     auto *Plan = getParent()->getPlan();
473     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
474     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
475 
476     // Replace the temporary unreachable terminator with a new conditional
477     // branch, hooking it up to backward destination (the header) now and to the
478     // forward destination (the exit/middle block) later when it is created.
479     // Note that CreateCondBr expects a valid BB as first argument, so we need
480     // to set it to nullptr later.
481     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
482                                               State.CFG.VPBB2IRBB[Header]);
483     CondBr->setSuccessor(0, nullptr);
484     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
485     return CondBr;
486   }
487   case VPInstruction::ComputeReductionResult: {
488     if (Part != 0)
489       return State.get(this, 0, /*IsScalar*/ true);
490 
491     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
492     // and will be removed by breaking up the recipe further.
493     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
494     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
495     // Get its reduction variable descriptor.
496     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
497 
498     RecurKind RK = RdxDesc.getRecurrenceKind();
499 
500     VPValue *LoopExitingDef = getOperand(1);
501     Type *PhiTy = OrigPhi->getType();
502     VectorParts RdxParts(State.UF);
503     for (unsigned Part = 0; Part < State.UF; ++Part)
504       RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
505 
506     // If the vector reduction can be performed in a smaller type, we truncate
507     // then extend the loop exit value to enable InstCombine to evaluate the
508     // entire expression in the smaller type.
509     // TODO: Handle this in truncateToMinBW.
510     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
511       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
512       for (unsigned Part = 0; Part < State.UF; ++Part)
513         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
514     }
515     // Reduce all of the unrolled parts into a single vector.
516     Value *ReducedPartRdx = RdxParts[0];
517     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
518     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
519       Op = Instruction::Or;
520 
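    // Ordered (strict FP) reductions already folded every part in-loop, so
    // the last part carries the final value; otherwise combine the unrolled
    // parts pairwise before the final horizontal reduction.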
    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly =
      canGenerateScalarForFirstLane() &&
      (vputils::onlyFirstLaneUsed(this) || isVectorToScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
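  // Generate one value per unroll part, except for PtrAdds whose users need
  // every lane, which are expanded lane-by-lane.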
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
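/// E.g. for Val = <a, a, a, a>, StartIdx = 0 and Step = s it produces
/// <a, a + s, a + 2 * s, a + 3 * s>.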
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
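  // Part 0 uses the phi value itself; each subsequent part adds another
  // VF * Step on top of the previous one.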
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps. BaseIV is the scalar induction variable
  // on which to base the steps, Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
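  // Each generated scalar step is BaseIV + (Part * VF + Lane) * Step;
  // StartIdx0 below supplies the Part * VF component.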
1338   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1339     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1340 
1341     if (!FirstLaneOnly && State.VF.isScalable()) {
1342       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1343       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1344       if (BaseIVTy->isFloatingPointTy())
1345         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1346       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1347       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1348       State.set(this, Add, Part);
1349       // It's useful to record the lane values too for the known minimum number
1350       // of elements so we do those below. This improves the code quality when
1351       // trying to extract the first element, for example.
1352     }
1353 
1354     if (BaseIVTy->isFloatingPointTy())
1355       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1356 
1357     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1358       Value *StartIdx = Builder.CreateBinOp(
1359           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1360       // The step returned by `createStepForVF` is a runtime-evaluated value
1361       // when VF is scalable. Otherwise, it should be folded into a Constant.
1362       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1363              "Expected StartIdx to be folded to a constant when VF is not "
1364              "scalable");
1365       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1366       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1367       State.set(this, Add, VPIteration(Part, Lane));
1368     }
1369   }
1370 }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
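
// Illustrative sketch, with assumed operand names: a scalar
//   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
// whose index is loop-varying is widened by the else-branch above into
//   %widen.gep = getelementptr inbounds i32, ptr %base, <4 x i64> %vec.iv
// yielding a <4 x ptr> for VF = 4, whereas the all-invariant case emits a
// single scalar GEP and splats it once per unroll part.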

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getModule()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the wide store
      // needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part, /*IsScalar*/ true);
  }
}
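
// Worked example of the reverse addressing above, assuming a fixed VF = 4
// (so RunTimeVF folds to 4): for Part = 1, NumElt = -1 * 4 = -4 and
// LastLane = 1 - 4 = -3, giving
//   %p = getelementptr i32, ptr %ptr, i32 -4
//   %q = getelementptr i32, ptr %p, i32 -3
// i.e. the wide access for the second part starts seven elements before
// %ptr, at the lowest address its reversed lanes touch.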

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and are taken from In0.
  VectorParts Entry(State.UF);
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single-edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
}
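
// Illustrative lowering, with assumed value names: a blend of three
// incoming values reduces to a chain of two selects per part, e.g.
//   %sel     = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
//   %predphi = select <4 x i1> %mask2, <4 x i32> %in2, <4 x i32> %sel
// matching the SELECT(...) nest sketched before the loop above.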

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}
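
// Illustrative sketch of the conditional case above, assuming an integer
// add reduction with VF = 4: masked-off lanes are replaced by the neutral
// element before reducing, e.g.
//   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer
// so they contribute the identity (0 for add) to the reduction result.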

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}
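
// Hypothetical example for exposition: a replicated scalar load feeding a
// VPPredInstPHIRecipe whose result is consumed by a widened user needs its
// per-lane results re-inserted into a vector, hence "packing" (S->V).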

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
/// if it is either defined outside the vector region or its operand is known to
/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
/// TODO: Uniformity should be associated with a VPValue and there should be a
/// generic way to check.
static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
  return C->isDefinedOutsideVectorRegions() ||
         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
}

Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
  assert(vputils::onlyFirstLaneUsed(this) &&
         "Codegen only implemented for first lane.");
  switch (Opcode) {
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::Trunc: {
    // Note: SExt/ZExt not used yet.
    Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
    return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

void VPScalarCastRecipe::execute(VPTransformState &State) {
  bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
  for (unsigned Part = 0; Part != State.UF; ++Part) {
    Value *Res;
    // Only generate a single instance if the recipe is uniform across UFs and
    // VFs.
    if (Part > 0 && IsUniformAcrossVFsAndUFs)
      Res = State.get(this, VPIteration(0, 0));
    else
      Res = generate(State, Part);
    State.set(this, Res, VPIteration(Part, 0));
  }
}
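
// Illustrative sketch, assuming a truncated canonical IV and UF = 2: the
// uniform case above emits a single
//   %cast = trunc i64 %index to i32
// for part 0 and reuses it for part 1 rather than emitting a second trunc.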

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "SCALAR-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *ResultTy;
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
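
// Illustrative sketch, with assumed names, for lane 2 of a <4 x i1> mask:
//   %c = extractelement <4 x i1> %mask, i32 2
//   br i1 %c, ...
// where both branch destinations start out null and are filled in once the
// predicated block and its continuation block have been created.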

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
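
// Illustrative sketch of the vector case above, with assumed value and
// block names: the phi merges the unmodified vector coming from the
// predicating block with the updated vector from the predicated block:
//   %vphi = phi <4 x i32> [ %vec, %predicating.bb ],
//                         [ %vec.insert, %predicated.bb ]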

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part, /*IsScalar*/ true);
}
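
// Illustrative result, assuming a zero start value and the conventional
// block names (the backedge value is added later, by the recipe computing
// the increment):
// vector.body:
//   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]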

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start,
    VPValue *Step) const {
  // Must be an integer induction.
  if (Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}
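
// Worked example of the arithmetic above, assuming a fixed VF = 4: for
// part p, VStep starts as splat(4p), becomes splat(4p) + <0, 1, 2, 3>,
// and the widened IV is
//   %vec.iv = splat(%index) + <4p, 4p + 1, 4p + 2, 4p + 3>
// so with UF = 2 the two parts cover lanes 0..7 of the canonical IV.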

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}
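
// Illustrative init value, assuming a fixed VF = 4 (LastIdx folds to 3):
// the insertelement above builds
//   %vector.recur.init = <poison, poison, poison, %init>
// placing the scalar start value in the last lane, where the first vector
// iteration of the recurrence reads it.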

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
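
// Illustrative start values, assuming an integer add reduction with
// VF = 4, UF = 2 and IsInLoop = false: part 0 starts at
// <%start, 0, 0, 0> via the insertelement above, while part 1 starts at
// the splatted identity zeroinitializer, so the start value is folded in
// exactly once.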

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif