//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isVectorToScalar())
    return true;
  switch (Opcode) {
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
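  // Note on semantics: lane I of the mask produced by
  // llvm.get.active.lane.mask above is (VIVElem0 + I) < ScalarTC, using an
  // overflow-free unsigned comparison. E.g. for VIVElem0 = 8, ScalarTC = 10
  // and VF = 4, the mask is <true, true, false, false>.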
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
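  // For example, with a fixed VF of 4 and UF of 2 (and assuming an i64 trip
  // count %tc), Step folds to the constant 8 and the emitted IR reduces to:
  //   %sub = sub i64 %tc, 8
  //   %cmp = icmp ugt i64 %tc, 8
  //   %sel = select i1 %cmp, i64 %sub, i64 0
  // which is the "TC > VF ? TC - VF : 0" form this opcode prints as, with VF
  // standing for VF * UF.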
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
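  // Sketch of the emitted call, assuming an i64 AVL and VF = vscale x 4
  // (exact types depend on the plan):
  //   %evl = call i32 @llvm.experimental.get.vector.length.i64(
  //              i64 %avl, i32 4, i1 true)
  // i.e. the number of lanes to process this iteration given %avl remaining
  // elements; the trailing i1 true marks the VF as scalable.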
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
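  // As a concrete sketch: for an integer add reduction with VF = 4 and UF = 2
  // (neither in-loop nor ordered), the code above first combines the two
  // unrolled parts,
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // and then createTargetReduction emits a single horizontal reduction such as
  //   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)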
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly =
      canGenerateScalarForFirstLane() &&
      (vputils::onlyFirstLaneUsed(this) || isVectorToScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
    return true;
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}
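
// For example, a scalar `%c = add nsw i32 %a, %b` widened with VF = 4 and
// UF = 1 becomes a single `%c = add nsw <4 x i32> %a, %b`: the nsw flag is
// re-applied via setFlags and metadata is propagated from the original add.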

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant only for FP induction variables.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

1114 
1115   // Create a vector of consecutive numbers from zero to VF.
1116   VectorType *InitVecValVTy = ValVTy;
1117   if (STy->isFloatingPointTy()) {
1118     Type *InitVecValSTy =
1119         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1120     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1121   }
1122   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1123 
1124   // Splat the StartIdx
1125   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1126 
1127   if (STy->isIntegerTy()) {
1128     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1129     Step = Builder.CreateVectorSplat(VLen, Step);
1130     assert(Step->getType() == Val->getType() && "Invalid step vec");
1131     // FIXME: The newly created binary instructions should contain nsw/nuw
1132     // flags, which can be found from the original scalar operations.
1133     Step = Builder.CreateMul(InitVec, Step);
1134     return Builder.CreateAdd(Val, Step, "induction");
1135   }
1136 
1137   // Floating point induction.
1138   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1139          "Binary Opcode should be specified for FP induction");
1140   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1141   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1142 
1143   Step = Builder.CreateVectorSplat(VLen, Step);
1144   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1145   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1146 }
1147 

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps. BaseIV is the scalar induction variable on
  // which to base the steps, and Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}
1396 
1397 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1398 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1399                                   VPSlotTracker &SlotTracker) const {
1400   O << Indent;
1401   printAsOperand(O, SlotTracker);
1402   O << " = SCALAR-STEPS ";
1403   printOperands(O, SlotTracker);
1404 }
1405 #endif
1406 
1407 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1408   assert(State.VF.isVector() && "not widening");
1409   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1410   // Construct a vector GEP by widening the operands of the scalar GEP as
1411   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1412   // results in a vector of pointers when at least one operand of the GEP
1413   // is vector-typed. Thus, to keep the representation compact, we only use
1414   // vector-typed operands for loop-varying values.
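  // E.g. a single vector-typed index already yields a vector of pointers
  // (illustrative, VF = 4):
  //   %gep = getelementptr inbounds i32, ptr %base, <4 x i64> %vec.idx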
1415 
1416   if (areAllOperandsInvariant()) {
1417     // If we are vectorizing, but the GEP has only loop-invariant operands,
1418     // the GEP we build (by only using vector-typed operands for
1419     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1420     // produce a vector of pointers, we need to either arbitrarily pick an
1421     // operand to broadcast, or broadcast a clone of the original GEP.
1422     // Here, we broadcast a clone of the original.
1423     //
1424     // TODO: If at some point we decide to scalarize instructions having
1425     //       loop-invariant operands, this special case will no longer be
1426     //       required. We would add the scalarization decision to
1427     //       collectLoopScalars() and teach getVectorValue() to broadcast
1428     //       the lane-zero scalar value.
1429     SmallVector<Value *> Ops;
1430     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1431       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1432 
1433     auto *NewGEP =
1434         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1435                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1436     for (unsigned Part = 0; Part < State.UF; ++Part) {
1437       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1438       State.set(this, EntryPart, Part);
1439       State.addMetadata(EntryPart, GEP);
1440     }
1441   } else {
1442     // If the GEP has at least one loop-varying operand, we are sure to
1443     // produce a vector of pointers. But if we are only unrolling, we want
1444     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1445     // produce with the code below will be scalar (if VF == 1) or vector
1446     // (otherwise). Note that for the unroll-only case, we still maintain
1447     // values in the vector mapping with initVector, as we do for other
1448     // instructions.
1449     for (unsigned Part = 0; Part < State.UF; ++Part) {
1450       // The pointer operand of the new GEP. If it's loop-invariant, we
1451       // won't broadcast it.
1452       auto *Ptr = isPointerLoopInvariant()
1453                       ? State.get(getOperand(0), VPIteration(0, 0))
1454                       : State.get(getOperand(0), Part);
1455 
1456       // Collect all the indices for the new GEP. If any index is
1457       // loop-invariant, we won't broadcast it.
1458       SmallVector<Value *, 4> Indices;
1459       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1460         VPValue *Operand = getOperand(I);
1461         if (isIndexLoopInvariant(I - 1))
1462           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1463         else
1464           Indices.push_back(State.get(Operand, Part));
1465       }
1466 
1467       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1468       // but otherwise it should be a vector.
1469       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1470                                              Indices, "", isInBounds());
1471       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1472              "NewGEP is not a pointer vector");
1473       State.set(this, NewGEP, Part);
1474       State.addMetadata(NewGEP, GEP);
1475     }
1476   }
1477 }
1478 
1479 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1480 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1481                              VPSlotTracker &SlotTracker) const {
1482   O << Indent << "WIDEN-GEP ";
1483   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1484   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1485     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1486 
1487   O << " ";
1488   printAsOperand(O, SlotTracker);
1489   O << " = getelementptr";
1490   printFlags(O);
1491   printOperands(O, SlotTracker);
1492 }
1493 #endif
1494 
1495 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1496   auto &Builder = State.Builder;
1497   State.setDebugLocFrom(getDebugLoc());
1498   for (unsigned Part = 0; Part < State.UF; ++Part) {
1499     // Calculate the pointer for this specific unroll part.
1500     Value *PartPtr = nullptr;
1501     // Use i32 for the GEP index type when the value is constant;
1502     // otherwise, query DataLayout for a more suitable index type.
1503     const DataLayout &DL =
1504         Builder.GetInsertBlock()->getModule()->getDataLayout();
1505     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1506                         ? DL.getIndexType(IndexedTy->getPointerTo())
1507                         : Builder.getInt32Ty();
1508     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1509     bool InBounds = isInBounds();
1510     if (IsReverse) {
1511       // If the address is consecutive but reversed, then the
1512       // wide store needs to start at the last vector element.
1513       // RunTimeVF = VScale * VF.getKnownMinValue()
1514       // For fixed-width VFs, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
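      // E.g. for fixed VF=4 and Part=0: NumElt = 0 and LastLane = -3, so
      // PartPtr = Ptr - 3 and the reversed wide access covers
      // Ptr[-3] .. Ptr[0].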
1515       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1516       // NumElt = -Part * RunTimeVF
1517       Value *NumElt = Builder.CreateMul(
1518           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1519       // LastLane = 1 - RunTimeVF
1520       Value *LastLane =
1521           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1522       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1523       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1524     } else {
1525       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1526       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1527     }
1528 
1529     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1530   }
1531 }
1532 
1533 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1534 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1535                                   VPSlotTracker &SlotTracker) const {
1536   O << Indent;
1537   printAsOperand(O, SlotTracker);
1538   O << " = vector-pointer ";
1539   if (IsReverse)
1540     O << "(reverse) ";
1541 
1542   printOperands(O, SlotTracker);
1543 }
1544 #endif
1545 
1546 void VPBlendRecipe::execute(VPTransformState &State) {
1547   State.setDebugLocFrom(getDebugLoc());
1548   // We know that all PHIs in non-header blocks are converted into
1549   // selects, so we don't have to worry about the insertion order and we
1550   // can just use the builder.
1551   // At this point we generate the predication tree. There may be
1552   // duplications since this is a simple recursive scan, but future
1553   // optimizations will clean it up.
1554 
1555   unsigned NumIncoming = getNumIncomingValues();
1556 
1557   // Generate a sequence of selects of the form:
1558   // SELECT(Mask3, In3,
1559   //        SELECT(Mask2, In2,
1560   //               SELECT(Mask1, In1,
1561   //                      In0)))
1562   // Note that Mask0 is never used: lanes for which no path reaches this phi
1563   // are essentially undef and are taken from In0.
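  // With two incoming values this lowers to a single select per part, e.g.
  // (VF = 4, names illustrative):
  //   %predphi = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0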
1564   VectorParts Entry(State.UF);
1565   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1566   for (unsigned In = 0; In < NumIncoming; ++In) {
1567     for (unsigned Part = 0; Part < State.UF; ++Part) {
1568       // We might have single-edge PHIs (blocks); use an identity
1569       // 'select' for the first PHI operand.
1570       Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
1571       if (In == 0)
1572         Entry[Part] = In0; // Initialize with the first incoming value.
1573       else {
1574         // Select between the current value and the previous incoming edge
1575         // based on the incoming mask.
1576         Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
1577         Entry[Part] =
1578             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1579       }
1580     }
1581   }
1582   for (unsigned Part = 0; Part < State.UF; ++Part)
1583     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
1584 }
1585 
1586 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1587 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1588                           VPSlotTracker &SlotTracker) const {
1589   O << Indent << "BLEND ";
1590   printAsOperand(O, SlotTracker);
1591   O << " =";
1592   if (getNumIncomingValues() == 1) {
1593     // Not a User of any mask: not really blending, this is a
1594     // single-predecessor phi.
1595     O << " ";
1596     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1597   } else {
1598     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1599       O << " ";
1600       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1601       if (I == 0)
1602         continue;
1603       O << "/";
1604       getMask(I)->printAsOperand(O, SlotTracker);
1605     }
1606   }
1607 }
1608 #endif
1609 
1610 void VPReductionRecipe::execute(VPTransformState &State) {
1611   assert(!State.Instance && "Reduction being replicated.");
1612   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1613   RecurKind Kind = RdxDesc.getRecurrenceKind();
1614   // Propagate the fast-math flags carried by the underlying instruction.
1615   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1616   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1617   for (unsigned Part = 0; Part < State.UF; ++Part) {
1618     Value *NewVecOp = State.get(getVecOp(), Part);
1619     if (VPValue *Cond = getCondOp()) {
1620       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1621       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1622       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1623       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
1624                                                   RdxDesc.getFastMathFlags());
1625       if (State.VF.isVector()) {
1626         Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
1627       }
1628 
1629       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
1630       NewVecOp = Select;
1631     }
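    // E.g. for a conditional integer add reduction with VF=4, inactive lanes
    // are masked to the identity 0 (illustrative):
    //   %sel = select <4 x i1> %cond, <4 x i32> %vec.op, <4 x i32> zeroinitializer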
1632     Value *NewRed;
1633     Value *NextInChain;
1634     if (IsOrdered) {
1635       if (State.VF.isVector())
1636         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1637                                         PrevInChain);
1638       else
1639         NewRed = State.Builder.CreateBinOp(
1640             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1641             NewVecOp);
1642       PrevInChain = NewRed;
1643     } else {
1644       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1645       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
1646     }
1647     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
1648       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1649                                    NewRed, PrevInChain);
1650     } else if (IsOrdered)
1651       NextInChain = NewRed;
1652     else
1653       NextInChain = State.Builder.CreateBinOp(
1654           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1655     State.set(this, NextInChain, Part, /*IsScalar*/ true);
1656   }
1657 }
1658 
1659 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1660 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1661                               VPSlotTracker &SlotTracker) const {
1662   O << Indent << "REDUCE ";
1663   printAsOperand(O, SlotTracker);
1664   O << " = ";
1665   getChainOp()->printAsOperand(O, SlotTracker);
1666   O << " +";
1667   if (isa<FPMathOperator>(getUnderlyingInstr()))
1668     O << getUnderlyingInstr()->getFastMathFlags();
1669   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1670   getVecOp()->printAsOperand(O, SlotTracker);
1671   if (getCondOp()) {
1672     O << ", ";
1673     getCondOp()->printAsOperand(O, SlotTracker);
1674   }
1675   O << ")";
1676   if (RdxDesc.IntermediateStore)
1677     O << " (with final reduction value stored in invariant address sank "
1678          "outside of loop)";
1679 }
1680 #endif
1681 
1682 bool VPReplicateRecipe::shouldPack() const {
1683   // Check whether the recipe is used by a widened recipe via an intervening
1684   // VPPredInstPHIRecipe; in that case, also pack the scalar values in a vector.
1685   return any_of(users(), [](const VPUser *U) {
1686     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1687       return any_of(PredR->users(), [PredR](const VPUser *U) {
1688         return !U->usesScalars(PredR);
1689       });
1690     return false;
1691   });
1692 }
1693 
1694 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1695 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1696                               VPSlotTracker &SlotTracker) const {
1697   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1698 
1699   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1700     printAsOperand(O, SlotTracker);
1701     O << " = ";
1702   }
1703   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1704     O << "call";
1705     printFlags(O);
1706     O << "@" << CB->getCalledFunction()->getName() << "(";
1707     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1708                     O, [&O, &SlotTracker](VPValue *Op) {
1709                       Op->printAsOperand(O, SlotTracker);
1710                     });
1711     O << ")";
1712   } else {
1713     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1714     printFlags(O);
1715     printOperands(O, SlotTracker);
1716   }
1717 
1718   if (shouldPack())
1719     O << " (S->V)";
1720 }
1721 #endif
1722 
1723 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
1724 /// if it is either defined outside the vector region or its operand is known to
1725 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
1726 /// TODO: Uniformity should be associated with a VPValue and there should be a
1727 /// generic way to check.
1728 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
1729   return C->isDefinedOutsideVectorRegions() ||
1730          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
1731          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
1732 }
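// For example, a cast whose operand is the canonical IV (VPCanonicalIVPHI) or
// a VPDerivedIV produces the same scalar for every UF and VF, so a single
// generated instance can be reused for all parts (see execute() below).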
1733 
1734 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
1735   assert(vputils::onlyFirstLaneUsed(this) &&
1736          "Codegen only implemented for first lane.");
1737   switch (Opcode) {
1738   case Instruction::SExt:
1739   case Instruction::ZExt:
1740   case Instruction::Trunc: {
1741     // Note: SExt/ZExt not used yet.
1742     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
1743     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
1744   }
1745   default:
1746     llvm_unreachable("opcode not implemented yet");
1747   }
1748 }
1749 
1750 void VPScalarCastRecipe::execute(VPTransformState &State) {
1751   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
1752   for (unsigned Part = 0; Part != State.UF; ++Part) {
1753     Value *Res;
1754     // Only generate a single instance if the recipe is uniform across all
1755     // UFs and VFs.
1756     if (Part > 0 && IsUniformAcrossVFsAndUFs)
1757       Res = State.get(this, VPIteration(0, 0));
1758     else
1759       Res = generate(State, Part);
1760     State.set(this, Res, VPIteration(Part, 0));
1761   }
1762 }
1763 
1764 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1765 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
1766                                VPSlotTracker &SlotTracker) const {
1767   O << Indent << "SCALAR-CAST ";
1768   printAsOperand(O, SlotTracker);
1769   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1770   printOperands(O, SlotTracker);
1771   O << " to " << *ResultTy;
1772 }
1773 #endif
1774 
1775 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1776   assert(State.Instance && "Branch on Mask works only on single instance.");
1777 
1778   unsigned Part = State.Instance->Part;
1779   unsigned Lane = State.Instance->Lane.getKnownLane();
1780 
1781   Value *ConditionBit = nullptr;
1782   VPValue *BlockInMask = getMask();
1783   if (BlockInMask) {
1784     ConditionBit = State.get(BlockInMask, Part);
1785     if (ConditionBit->getType()->isVectorTy())
1786       ConditionBit = State.Builder.CreateExtractElement(
1787           ConditionBit, State.Builder.getInt32(Lane));
1788   } else // Block-in mask is all-ones.
1789     ConditionBit = State.Builder.getTrue();
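  // E.g. with a mask present, the current lane's bit is extracted
  // (illustrative, lane 2): %cond = extractelement <4 x i1> %mask, i32 2
  // and the conditional branch below is keyed on it.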
1790 
1791   // Replace the temporary unreachable terminator with a new conditional branch,
1792   // whose two destinations will be set later when they are created.
1793   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1794   assert(isa<UnreachableInst>(CurrentTerminator) &&
1795          "Expected to replace unreachable terminator with conditional branch.");
1796   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1797   CondBr->setSuccessor(0, nullptr);
1798   ReplaceInstWithInst(CurrentTerminator, CondBr);
1799 }
1800 
1801 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1802   assert(State.Instance && "Predicated instruction PHI works per instance.");
1803   Instruction *ScalarPredInst =
1804       cast<Instruction>(State.get(getOperand(0), *State.Instance));
1805   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1806   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1807   assert(PredicatingBB && "Predicated block has no single predecessor.");
1808   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1809          "operand must be VPReplicateRecipe");
1810 
1811   // By current pack/unpack logic we need to generate only a single phi node: if
1812   // a vector value for the predicated instruction exists at this point it means
1813   // the instruction has vector users only, and a phi for the vector value is
1814   // needed. In this case the recipe of the predicated instruction is marked to
1815   // also do that packing, thereby "hoisting" the insert-element sequence.
1816   // Otherwise, a phi node for the scalar value is needed.
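  // A minimal sketch of the packed case (names illustrative):
  //   %vphi = phi <4 x i32> [ %vec, %predicating.bb ], [ %vec.ins, %predicated.bb ]
  // where %vec.ins is the insertelement produced in the predicated block.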
1817   unsigned Part = State.Instance->Part;
1818   if (State.hasVectorValue(getOperand(0), Part)) {
1819     Value *VectorValue = State.get(getOperand(0), Part);
1820     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1821     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1822     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1823     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1824     if (State.hasVectorValue(this, Part))
1825       State.reset(this, VPhi, Part);
1826     else
1827       State.set(this, VPhi, Part);
1828     // NOTE: Currently we need to update the value of the operand, so the next
1829     // predicated iteration inserts its generated value in the correct vector.
1830     State.reset(getOperand(0), VPhi, Part);
1831   } else {
1832     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1833     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1834     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1835                      PredicatingBB);
1836     Phi->addIncoming(ScalarPredInst, PredicatedBB);
1837     if (State.hasScalarValue(this, *State.Instance))
1838       State.reset(this, Phi, *State.Instance);
1839     else
1840       State.set(this, Phi, *State.Instance);
1841     // NOTE: Currently we need to update the value of the operand, so the next
1842     // predicated iteration inserts its generated value in the correct vector.
1843     State.reset(getOperand(0), Phi, *State.Instance);
1844   }
1845 }
1846 
1847 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1848 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1849                                 VPSlotTracker &SlotTracker) const {
1850   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1851   printAsOperand(O, SlotTracker);
1852   O << " = ";
1853   printOperands(O, SlotTracker);
1854 }
1855 
1856 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
1857                               VPSlotTracker &SlotTracker) const {
1858   O << Indent << "WIDEN ";
1859   printAsOperand(O, SlotTracker);
1860   O << " = load ";
1861   printOperands(O, SlotTracker);
1862 }
1863 
1864 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1865                                  VPSlotTracker &SlotTracker) const {
1866   O << Indent << "WIDEN ";
1867   printAsOperand(O, SlotTracker);
1868   O << " = vp.load ";
1869   printOperands(O, SlotTracker);
1870 }
1871 
1872 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
1873                                VPSlotTracker &SlotTracker) const {
1874   O << Indent << "WIDEN store ";
1875   printOperands(O, SlotTracker);
1876 }
1877 
1878 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1879                                   VPSlotTracker &SlotTracker) const {
1880   O << Indent << "WIDEN vp.store ";
1881   printOperands(O, SlotTracker);
1882 }
1883 #endif
1884 
1885 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1886   Value *Start = getStartValue()->getLiveInIRValue();
1887   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1888   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1889 
1890   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1891   EntryPart->addIncoming(Start, VectorPH);
1892   EntryPart->setDebugLoc(getDebugLoc());
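  // A minimal sketch of the IR at this point (the backedge incoming value is
  // only added later, once the loop latch has been wired up):
  //   %index = phi i64 [ %start, %vector.ph ]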
1893   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1894     State.set(this, EntryPart, Part, /*IsScalar*/ true);
1895 }
1896 
1897 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1898 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1899                                    VPSlotTracker &SlotTracker) const {
1900   O << Indent << "EMIT ";
1901   printAsOperand(O, SlotTracker);
1902   O << " = CANONICAL-INDUCTION ";
1903   printOperands(O, SlotTracker);
1904 }
1905 #endif
1906 
1907 bool VPCanonicalIVPHIRecipe::isCanonical(
1908     InductionDescriptor::InductionKind Kind, VPValue *Start,
1909     VPValue *Step) const {
1910   // Must be an integer induction.
1911   if (Kind != InductionDescriptor::IK_IntInduction)
1912     return false;
1913   // Start must match the start value of this canonical induction.
1914   if (Start != getStartValue())
1915     return false;
1916 
1917   // If the step is defined by a recipe, it is not a ConstantInt.
1918   if (Step->getDefiningRecipe())
1919     return false;
1920 
1921   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1922   return StepC && StepC->isOne();
1923 }
1924 
1925 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
1926   return IsScalarAfterVectorization &&
1927          (!IsScalable || vputils::onlyFirstLaneUsed(this));
1928 }
1929 
1930 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1931 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1932                                           VPSlotTracker &SlotTracker) const {
1933   O << Indent << "EMIT ";
1934   printAsOperand(O, SlotTracker);
1935   O << " = WIDEN-POINTER-INDUCTION ";
1936   getStartValue()->printAsOperand(O, SlotTracker);
1937   O << ", " << *IndDesc.getStep();
1938 }
1939 #endif
1940 
1941 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1942   assert(!State.Instance && "cannot be used in per-lane");
1943   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1944   SCEVExpander Exp(SE, DL, "induction");
1945 
1946   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1947                                  &*State.Builder.GetInsertPoint());
1948   assert(!State.ExpandedSCEVs.contains(Expr) &&
1949          "Same SCEV expanded multiple times");
1950   State.ExpandedSCEVs[Expr] = Res;
1951   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1952     State.set(this, Res, {Part, 0});
1953 }
1954 
1955 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1956 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1957                                VPSlotTracker &SlotTracker) const {
1958   O << Indent << "EMIT ";
1959   getVPSingleValue()->printAsOperand(O, SlotTracker);
1960   O << " = EXPAND SCEV " << *Expr;
1961 }
1962 #endif
1963 
1964 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1965   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
1966   Type *STy = CanonicalIV->getType();
1967   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1968   ElementCount VF = State.VF;
1969   Value *VStart = VF.isScalar()
1970                       ? CanonicalIV
1971                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1972   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1973     Value *VStep = createStepForVF(Builder, STy, VF, Part);
1974     if (VF.isVector()) {
1975       VStep = Builder.CreateVectorSplat(VF, VStep);
1976       VStep =
1977           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1978     }
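    // E.g. for fixed VF=4, VStep for part P is splat(4 * P) + <0, 1, 2, 3>,
    // so %vec.iv = %broadcast.iv + <4P, 4P+1, 4P+2, 4P+3>.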
1979     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1980     State.set(this, CanonicalVectorIV, Part);
1981   }
1982 }
1983 
1984 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1985 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1986                                      VPSlotTracker &SlotTracker) const {
1987   O << Indent << "EMIT ";
1988   printAsOperand(O, SlotTracker);
1989   O << " = WIDEN-CANONICAL-INDUCTION ";
1990   printOperands(O, SlotTracker);
1991 }
1992 #endif
1993 
1994 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1995   auto &Builder = State.Builder;
1996   // Create a vector from the initial value.
1997   auto *VectorInit = getStartValue()->getLiveInIRValue();
1998 
1999   Type *VecTy = State.VF.isScalar()
2000                     ? VectorInit->getType()
2001                     : VectorType::get(VectorInit->getType(), State.VF);
2002 
2003   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2004   if (State.VF.isVector()) {
2005     auto *IdxTy = Builder.getInt32Ty();
2006     auto *One = ConstantInt::get(IdxTy, 1);
2007     IRBuilder<>::InsertPointGuard Guard(Builder);
2008     Builder.SetInsertPoint(VectorPH->getTerminator());
2009     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
2010     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
2011     VectorInit = Builder.CreateInsertElement(
2012         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
2013   }
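  // E.g. with VF=4 the recurrence is seeded so that lane VF-1 carries the
  // scalar initial value into the first iteration (illustrative):
  //   %vector.recur.init = <4 x i32> <poison, poison, poison, %init>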
2014 
2015   // Create a phi node for the new recurrence.
2016   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
2017   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2018   EntryPart->addIncoming(VectorInit, VectorPH);
2019   State.set(this, EntryPart, 0);
2020 }
2021 
2022 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2023 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
2024                                             VPSlotTracker &SlotTracker) const {
2025   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
2026   printAsOperand(O, SlotTracker);
2027   O << " = phi ";
2028   printOperands(O, SlotTracker);
2029 }
2030 #endif
2031 
2032 void VPReductionPHIRecipe::execute(VPTransformState &State) {
2033   auto &Builder = State.Builder;
2034 
2035   // Reductions do not have to start at zero. They can start with
2036   // any loop-invariant value.
2037   VPValue *StartVPV = getStartValue();
2038   Value *StartV = StartVPV->getLiveInIRValue();
2039 
2040   // In order to support recurrences, we need to be able to vectorize PHI
2041   // nodes. PHI nodes form cycles, so we vectorize them in two stages. This is
2042   // stage #1: we create a new vector PHI node with no incoming edges. We'll use
2043   // this value when we vectorize all of the instructions that use the PHI.
2044   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
2045   Type *VecTy = ScalarPHI ? StartV->getType()
2046                           : VectorType::get(StartV->getType(), State.VF);
2047 
2048   BasicBlock *HeaderBB = State.CFG.PrevBB;
2049   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
2050          "recipe must be in the vector loop header");
2051   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
2052   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2053     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
2054     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
2055     State.set(this, EntryPart, Part, IsInLoop);
2056   }
2057 
2058   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2059 
2060   Value *Iden = nullptr;
2061   RecurKind RK = RdxDesc.getRecurrenceKind();
2062   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
2063       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
2064     // MinMax and AnyOf reductions have the start value as their identity.
2065     if (ScalarPHI) {
2066       Iden = StartV;
2067     } else {
2068       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2069       Builder.SetInsertPoint(VectorPH->getTerminator());
2070       StartV = Iden =
2071           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
2072     }
2073   } else {
2074     Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
2075                                          RdxDesc.getFastMathFlags());
2076 
2077     if (!ScalarPHI) {
2078       Iden = Builder.CreateVectorSplat(State.VF, Iden);
2079       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2080       Builder.SetInsertPoint(VectorPH->getTerminator());
2081       Constant *Zero = Builder.getInt32(0);
2082       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
2083     }
2084   }
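  // E.g. for an fadd reduction with VF=4, Iden is a splat of -0.0 and the
  // start value is inserted into lane 0, conceptually <%start, -0.0, -0.0,
  // -0.0>; only the first unroll part receives the start value.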
2085 
2086   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2087     Value *EntryPart = State.get(this, Part, IsInLoop);
2088     // Make sure to add the reduction start value only to the
2089     // first unroll part.
2090     Value *StartVal = (Part == 0) ? StartV : Iden;
2091     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
2092   }
2093 }
2094 
2095 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2096 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2097                                  VPSlotTracker &SlotTracker) const {
2098   O << Indent << "WIDEN-REDUCTION-PHI ";
2099 
2100   printAsOperand(O, SlotTracker);
2101   O << " = phi ";
2102   printOperands(O, SlotTracker);
2103 }
2104 #endif
2105 
2106 void VPWidenPHIRecipe::execute(VPTransformState &State) {
2107   assert(EnableVPlanNativePath &&
2108          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
2109 
2110   Value *Op0 = State.get(getOperand(0), 0);
2111   Type *VecTy = Op0->getType();
2112   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
2113   State.set(this, VecPhi, 0);
2114 }
2115 
2116 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2117 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2118                              VPSlotTracker &SlotTracker) const {
2119   O << Indent << "WIDEN-PHI ";
2120 
2121   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
2122   // Unless all incoming values are modeled in VPlan, print the original PHI
2123   // directly.
2124   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
2125   // values as VPValues.
2126   if (getNumOperands() != OriginalPhi->getNumOperands()) {
2127     O << VPlanIngredient(OriginalPhi);
2128     return;
2129   }
2130 
2131   printAsOperand(O, SlotTracker);
2132   O << " = phi ";
2133   printOperands(O, SlotTracker);
2134 }
2135 #endif
2136 
2137 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2138 // remove VPActiveLaneMaskPHIRecipe.
2139 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
2140   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2141   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2142     Value *StartMask = State.get(getOperand(0), Part);
2143     PHINode *EntryPart =
2144         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
2145     EntryPart->addIncoming(StartMask, VectorPH);
2146     EntryPart->setDebugLoc(getDebugLoc());
2147     State.set(this, EntryPart, Part);
2148   }
2149 }
2150 
2151 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2152 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2153                                       VPSlotTracker &SlotTracker) const {
2154   O << Indent << "ACTIVE-LANE-MASK-PHI ";
2155 
2156   printAsOperand(O, SlotTracker);
2157   O << " = phi ";
2158   printOperands(O, SlotTracker);
2159 }
2160 #endif
2161 
2162 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
2163   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2164   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
2165   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
2166   PHINode *EntryPart =
2167       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
2168   EntryPart->addIncoming(Start, VectorPH);
2169   EntryPart->setDebugLoc(getDebugLoc());
2170   State.set(this, EntryPart, 0, /*IsScalar=*/true);
2171 }
2172 
2173 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2174 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2175                                   VPSlotTracker &SlotTracker) const {
2176   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
2177 
2178   printAsOperand(O, SlotTracker);
2179   O << " = phi ";
2180   printOperands(O, SlotTracker);
2181 }
2182 #endif
2183