//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

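// Return true if the recipe may write to memory, based on the recipe kind
// and, where needed, the underlying ingredient or called function. Unknown
// recipe kinds conservatively return true.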
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

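// Add an incoming value to the exit phi from the middle block: the first lane
// of the exit value if it is uniform after vectorization, otherwise the last
// lane of the final unrolled part.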
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                  ? VPLane::getFirstLane()
                  : VPLane::getLastLaneForVF(State.VF);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

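// Return the cost of the recipe for VF: zero if the cost has already been
// accounted for, otherwise the computed cost, overridden by
// ForceTargetInstructionCost when that option is set and the cost is valid.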
InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  if (auto *UI = getInstructionForCost(this))
    if (Ctx.skipCostComputation(UI, VF.isVector()))
      return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe by falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // return 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing its cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

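    // Compute max(TC - VF * UF, 0) without underflow:
    // select(TC > VF * UF, TC - VF * UF, 0).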
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that in-loop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly =
      canGenerateScalarForFirstLane() &&
      (vputils::onlyFirstLaneUsed(this) || isVectorToScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
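  // For each unroll part, generate either one value per lane, a single scalar
  // if only the first lane is needed, or one (vector) value per part; parts
  // other than the first reuse the part-0 value when only it is used.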
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
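  // The call is widened either via a vector intrinsic or via a vectorized
  // library-function variant (Variant); arguments and any overload types are
  // collected per part below.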
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p Opcode is relevant for FP induction variables.
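/// For example, for an integer splat Val = &lt;a, a, a, a&gt; with StartIdx = 0 and
/// Step = s, the result is &lt;a, a + s, a + 2*s, a + 3*s&gt;.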
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps: BaseIV is the scalar induction variable on
  // which to base the steps, and Step is the size of the step.
1362 
1363   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1364   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1365   IRBuilderBase &Builder = State.Builder;
1366 
1367   // Ensure step has the same type as that of scalar IV.
1368   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1369   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1370 
1371   // We build scalar steps for both integer and floating-point induction
1372   // variables. Here, we determine the kind of arithmetic we will perform.
1373   Instruction::BinaryOps AddOp;
1374   Instruction::BinaryOps MulOp;
1375   if (BaseIVTy->isIntegerTy()) {
1376     AddOp = Instruction::Add;
1377     MulOp = Instruction::Mul;
1378   } else {
1379     AddOp = InductionOpcode;
1380     MulOp = Instruction::FMul;
1381   }
1382 
1383   // Determine the number of scalars we need to generate for each unroll
1384   // iteration.
1385   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1386   // Compute the scalar steps and save the results in State.
1387   Type *IntStepTy =
1388       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1389   Type *VecIVTy = nullptr;
1390   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1391   if (!FirstLaneOnly && State.VF.isScalable()) {
1392     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1393     UnitStepVec =
1394         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1395     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1396     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1397   }
1398 
1399   unsigned StartPart = 0;
1400   unsigned EndPart = State.UF;
1401   unsigned StartLane = 0;
1402   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1403   if (State.Instance) {
1404     StartPart = State.Instance->Part;
1405     EndPart = StartPart + 1;
1406     StartLane = State.Instance->Lane.getKnownLane();
1407     EndLane = StartLane + 1;
1408   }
1409   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1410     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1411 
1412     if (!FirstLaneOnly && State.VF.isScalable()) {
1413       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1414       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1415       if (BaseIVTy->isFloatingPointTy())
1416         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1417       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1418       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1419       State.set(this, Add, Part);
1420       // It's also useful to record the per-lane values for the known minimum
1421       // number of elements, so we do that below. This improves code quality
1422       // when, for example, extracting the first element.
1423     }
1424 
1425     if (BaseIVTy->isFloatingPointTy())
1426       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1427 
1428     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1429       Value *StartIdx = Builder.CreateBinOp(
1430           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1431       // The step returned by `createStepForVF` is a runtime-evaluated value
1432       // when VF is scalable. Otherwise, it should be folded into a Constant.
1433       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1434              "Expected StartIdx to be folded to a constant when VF is not "
1435              "scalable");
1436       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1437       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1438       State.set(this, Add, VPIteration(Part, Lane));
1439     }
1440   }
1441 }
1442 
1443 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1444 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1445                                   VPSlotTracker &SlotTracker) const {
1446   O << Indent;
1447   printAsOperand(O, SlotTracker);
1448   O << " = SCALAR-STEPS ";
1449   printOperands(O, SlotTracker);
1450 }
1451 #endif
1452 
1453 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1454   assert(State.VF.isVector() && "not widening");
1455   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1456   // Construct a vector GEP by widening the operands of the scalar GEP as
1457   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1458   // results in a vector of pointers when at least one operand of the GEP
1459   // is vector-typed. Thus, to keep the representation compact, we only use
1460   // vector-typed operands for loop-varying values.
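       // For example (an illustrative sketch), when widening
       //   %gep = getelementptr float, ptr %A, i64 %iv
       // with a loop-varying %iv, %A stays scalar and only %iv becomes a vector
       // operand, yielding one vector of pointers per unroll part.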
1461 
1462   if (areAllOperandsInvariant()) {
1463     // If we are vectorizing, but the GEP has only loop-invariant operands,
1464     // the GEP we build (by only using vector-typed operands for
1465     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1466     // produce a vector of pointers, we need to either arbitrarily pick an
1467     // operand to broadcast, or broadcast a clone of the original GEP.
1468     // Here, we broadcast a clone of the original.
1469     //
1470     // TODO: If at some point we decide to scalarize instructions having
1471     //       loop-invariant operands, this special case will no longer be
1472     //       required. We would add the scalarization decision to
1473     //       collectLoopScalars() and teach getVectorValue() to broadcast
1474     //       the lane-zero scalar value.
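         // A rough sketch of the result (names hypothetical): the scalar GEP is
         // recreated from lane-0 operands and broadcast to all lanes via
         // insertelement + shufflevector, once per unroll part.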
1475     SmallVector<Value *> Ops;
1476     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1477       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1478 
1479     auto *NewGEP =
1480         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1481                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1482     for (unsigned Part = 0; Part < State.UF; ++Part) {
1483       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1484       State.set(this, EntryPart, Part);
1485       State.addMetadata(EntryPart, GEP);
1486     }
1487   } else {
1488     // If the GEP has at least one loop-varying operand, we are sure to
1489     // produce a vector of pointers. But if we are only unrolling, we want
1490     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1491     // produce with the code below will be scalar (if VF == 1) or vector
1492     // (otherwise). Note that for the unroll-only case, we still maintain
1493     // values in the vector mapping with initVector, as we do for other
1494     // instructions.
1495     for (unsigned Part = 0; Part < State.UF; ++Part) {
1496       // The pointer operand of the new GEP. If it's loop-invariant, we
1497       // won't broadcast it.
1498       auto *Ptr = isPointerLoopInvariant()
1499                       ? State.get(getOperand(0), VPIteration(0, 0))
1500                       : State.get(getOperand(0), Part);
1501 
1502       // Collect all the indices for the new GEP. If any index is
1503       // loop-invariant, we won't broadcast it.
1504       SmallVector<Value *, 4> Indices;
1505       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1506         VPValue *Operand = getOperand(I);
1507         if (isIndexLoopInvariant(I - 1))
1508           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1509         else
1510           Indices.push_back(State.get(Operand, Part));
1511       }
1512 
1513       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1514       // but it should be a vector, otherwise.
1515       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1516                                              Indices, "", isInBounds());
1517       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1518              "NewGEP is not a pointer vector");
1519       State.set(this, NewGEP, Part);
1520       State.addMetadata(NewGEP, GEP);
1521     }
1522   }
1523 }
1524 
1525 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1526 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1527                              VPSlotTracker &SlotTracker) const {
1528   O << Indent << "WIDEN-GEP ";
1529   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1530   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1531     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1532 
1533   O << " ";
1534   printAsOperand(O, SlotTracker);
1535   O << " = getelementptr";
1536   printFlags(O);
1537   printOperands(O, SlotTracker);
1538 }
1539 #endif
1540 
1541 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1542   auto &Builder = State.Builder;
1543   State.setDebugLocFrom(getDebugLoc());
1544   for (unsigned Part = 0; Part < State.UF; ++Part) {
1545     // Calculate the pointer for the specific unroll-part.
1546     Value *PartPtr = nullptr;
1547     // Use i32 for the gep index type when the value is constant,
1548     // or query DataLayout for a more suitable index type otherwise.
1549     const DataLayout &DL =
1550         Builder.GetInsertBlock()->getDataLayout();
1551     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1552                         ? DL.getIndexType(IndexedTy->getPointerTo())
1553                         : Builder.getInt32Ty();
1554     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1555     bool InBounds = isInBounds();
1556     if (IsReverse) {
1557       // If the address is consecutive but reversed, then the
1558       // wide store needs to start at the last vector element.
1559       // RunTimeVF = VScale * VF.getKnownMinValue()
1560       // For fixed-width VFs, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
1561       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1562       // NumElt = -Part * RunTimeVF
1563       Value *NumElt = Builder.CreateMul(
1564           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1565       // LastLane = 1 - RunTimeVF
1566       Value *LastLane =
1567           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
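           // For example (a fixed-width illustration with VF=4, so
           // RunTimeVF=4): part 1 gets NumElt = -4 and LastLane = -3, so the
           // two GEPs below compute PartPtr = Ptr - 7, the lowest address of
           // the four elements this reversed part covers.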
1568       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1569       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1570     } else {
1571       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1572       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1573     }
1574 
1575     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1576   }
1577 }
1578 
1579 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1580 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1581                                   VPSlotTracker &SlotTracker) const {
1582   O << Indent;
1583   printAsOperand(O, SlotTracker);
1584   O << " = vector-pointer ";
1585   if (IsReverse)
1586     O << "(reverse) ";
1587 
1588   printOperands(O, SlotTracker);
1589 }
1590 #endif
1591 
1592 void VPBlendRecipe::execute(VPTransformState &State) {
1593   State.setDebugLocFrom(getDebugLoc());
1594   // We know that all PHIs in non-header blocks are converted into
1595   // selects, so we don't have to worry about the insertion order and we
1596   // can just use the builder.
1597   // At this point we generate the predication tree. There may be
1598   // duplications since this is a simple recursive scan, but future
1599   // optimizations will clean it up.
1600 
1601   unsigned NumIncoming = getNumIncomingValues();
1602 
1603   // Generate a sequence of selects of the form:
1604   // SELECT(Mask3, In3,
1605   //        SELECT(Mask2, In2,
1606   //               SELECT(Mask1, In1,
1607   //                      In0)))
1608   // Note that Mask0 is never used: lanes for which no path reaches this phi,
1609   // and which are therefore essentially undef, take their value from In0.
1610   VectorParts Entry(State.UF);
1611   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1612   for (unsigned In = 0; In < NumIncoming; ++In) {
1613     for (unsigned Part = 0; Part < State.UF; ++Part) {
1614       // We might have single edge PHIs (blocks) - use an identity
1615       // 'select' for the first PHI operand.
1616       Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
1617       if (In == 0)
1618         Entry[Part] = In0; // Initialize with the first incoming value.
1619       else {
1620         // Select between the current value and the previous incoming edge
1621         // based on the incoming mask.
1622         Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
1623         Entry[Part] =
1624             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1625       }
1626     }
1627   }
1628   for (unsigned Part = 0; Part < State.UF; ++Part)
1629     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
1630 }
1631 
1632 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1633 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1634                           VPSlotTracker &SlotTracker) const {
1635   O << Indent << "BLEND ";
1636   printAsOperand(O, SlotTracker);
1637   O << " =";
1638   if (getNumIncomingValues() == 1) {
1639     // Not a User of any mask: not really blending, this is a
1640     // single-predecessor phi.
1641     O << " ";
1642     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1643   } else {
1644     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1645       O << " ";
1646       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1647       if (I == 0)
1648         continue;
1649       O << "/";
1650       getMask(I)->printAsOperand(O, SlotTracker);
1651     }
1652   }
1653 }
1654 #endif
1655 
1656 void VPReductionRecipe::execute(VPTransformState &State) {
1657   assert(!State.Instance && "Reduction being replicated.");
1658   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1659   RecurKind Kind = RdxDesc.getRecurrenceKind();
1660   // Propagate the fast-math flags carried by the underlying instruction.
1661   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1662   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1663   for (unsigned Part = 0; Part < State.UF; ++Part) {
1664     Value *NewVecOp = State.get(getVecOp(), Part);
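         // For a conditional reduction, replace the lanes masked out by the
         // condition with the reduction's identity value (e.g. 0 for an
         // integer add), so they do not change the reduced result.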
1665     if (VPValue *Cond = getCondOp()) {
1666       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1667       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1668       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1669       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
1670                                                   RdxDesc.getFastMathFlags());
1671       if (State.VF.isVector()) {
1672         Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
1673       }
1674 
1675       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
1676       NewVecOp = Select;
1677     }
1678     Value *NewRed;
1679     Value *NextInChain;
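         // Ordered (strict FP) reductions fold the vector operand into the
         // chain sequentially to preserve the scalar loop's rounding behavior;
         // otherwise each part is reduced independently and then combined with
         // the chain value.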
1680     if (IsOrdered) {
1681       if (State.VF.isVector())
1682         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1683                                         PrevInChain);
1684       else
1685         NewRed = State.Builder.CreateBinOp(
1686             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1687             NewVecOp);
1688       PrevInChain = NewRed;
1689     } else {
1690       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1691       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
1692     }
1693     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
1694       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1695                                    NewRed, PrevInChain);
1696     } else if (IsOrdered)
1697       NextInChain = NewRed;
1698     else
1699       NextInChain = State.Builder.CreateBinOp(
1700           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1701     State.set(this, NextInChain, Part, /*IsScalar*/ true);
1702   }
1703 }
1704 
1705 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1706 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1707                               VPSlotTracker &SlotTracker) const {
1708   O << Indent << "REDUCE ";
1709   printAsOperand(O, SlotTracker);
1710   O << " = ";
1711   getChainOp()->printAsOperand(O, SlotTracker);
1712   O << " +";
1713   if (isa<FPMathOperator>(getUnderlyingInstr()))
1714     O << getUnderlyingInstr()->getFastMathFlags();
1715   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1716   getVecOp()->printAsOperand(O, SlotTracker);
1717   if (getCondOp()) {
1718     O << ", ";
1719     getCondOp()->printAsOperand(O, SlotTracker);
1720   }
1721   O << ")";
1722   if (RdxDesc.IntermediateStore)
1723     O << " (with final reduction value stored in invariant address sank "
1724          "outside of loop)";
1725 }
1726 #endif
1727 
1728 bool VPReplicateRecipe::shouldPack() const {
1729   // Find if the recipe is used by a widened recipe via an intervening
1730   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1731   return any_of(users(), [](const VPUser *U) {
1732     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1733       return any_of(PredR->users(), [PredR](const VPUser *U) {
1734         return !U->usesScalars(PredR);
1735       });
1736     return false;
1737   });
1738 }
1739 
1740 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1741 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1742                               VPSlotTracker &SlotTracker) const {
1743   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1744 
1745   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1746     printAsOperand(O, SlotTracker);
1747     O << " = ";
1748   }
1749   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1750     O << "call";
1751     printFlags(O);
1752     O << "@" << CB->getCalledFunction()->getName() << "(";
1753     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1754                     O, [&O, &SlotTracker](VPValue *Op) {
1755                       Op->printAsOperand(O, SlotTracker);
1756                     });
1757     O << ")";
1758   } else {
1759     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1760     printFlags(O);
1761     printOperands(O, SlotTracker);
1762   }
1763 
1764   if (shouldPack())
1765     O << " (S->V)";
1766 }
1767 #endif
1768 
1769 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
1770 /// if it is either defined outside the vector region or its operand is known to
1771 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
1772 /// TODO: Uniformity should be associated with a VPValue and there should be a
1773 /// generic way to check.
1774 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
1775   return C->isDefinedOutsideVectorRegions() ||
1776          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
1777          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
1778 }
1779 
1780 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
1781   assert(vputils::onlyFirstLaneUsed(this) &&
1782          "Codegen only implemented for first lane.");
1783   switch (Opcode) {
1784   case Instruction::SExt:
1785   case Instruction::ZExt:
1786   case Instruction::Trunc: {
1787     // Note: SExt/ZExt not used yet.
1788     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
1789     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
1790   }
1791   default:
1792     llvm_unreachable("opcode not implemented yet");
1793   }
1794 }
1795 
1796 void VPScalarCastRecipe::execute(VPTransformState &State) {
1797   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
1798   for (unsigned Part = 0; Part != State.UF; ++Part) {
1799     Value *Res;
1800     // Only generate a single instance if the recipe is uniform across UFs
1801     // and VFs.
1802     if (Part > 0 && IsUniformAcrossVFsAndUFs)
1803       Res = State.get(this, VPIteration(0, 0));
1804     else
1805       Res = generate(State, Part);
1806     State.set(this, Res, VPIteration(Part, 0));
1807   }
1808 }
1809 
1810 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1811 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
1812                                VPSlotTracker &SlotTracker) const {
1813   O << Indent << "SCALAR-CAST ";
1814   printAsOperand(O, SlotTracker);
1815   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1816   printOperands(O, SlotTracker);
1817   O << " to " << *ResultTy;
1818 }
1819 #endif
1820 
1821 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1822   assert(State.Instance && "Branch on Mask works only on single instance.");
1823 
1824   unsigned Part = State.Instance->Part;
1825   unsigned Lane = State.Instance->Lane.getKnownLane();
1826 
1827   Value *ConditionBit = nullptr;
1828   VPValue *BlockInMask = getMask();
1829   if (BlockInMask) {
1830     ConditionBit = State.get(BlockInMask, Part);
1831     if (ConditionBit->getType()->isVectorTy())
1832       ConditionBit = State.Builder.CreateExtractElement(
1833           ConditionBit, State.Builder.getInt32(Lane));
1834   } else // Block in mask is all-one.
1835     ConditionBit = State.Builder.getTrue();
1836 
1837   // Replace the temporary unreachable terminator with a new conditional branch,
1838   // whose two destinations will be set later when they are created.
1839   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1840   assert(isa<UnreachableInst>(CurrentTerminator) &&
1841          "Expected to replace unreachable terminator with conditional branch.");
1842   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1843   CondBr->setSuccessor(0, nullptr);
1844   ReplaceInstWithInst(CurrentTerminator, CondBr);
1845 }
1846 
1847 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1848   assert(State.Instance && "Predicated instruction PHI works per instance.");
1849   Instruction *ScalarPredInst =
1850       cast<Instruction>(State.get(getOperand(0), *State.Instance));
1851   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1852   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1853   assert(PredicatingBB && "Predicated block has no single predecessor.");
1854   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1855          "operand must be VPReplicateRecipe");
1856 
1857   // By current pack/unpack logic we need to generate only a single phi node: if
1858   // a vector value for the predicated instruction exists at this point it means
1859   // the instruction has vector users only, and a phi for the vector value is
1860   // needed. In this case the recipe of the predicated instruction is marked to
1861   // also do that packing, thereby "hoisting" the insert-element sequence.
1862   // Otherwise, a phi node for the scalar value is needed.
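       // E.g. in the vector case the phi merges the unmodified vector from the
       // predicating block with the updated vector from the predicated block
       // (a sketch; names hypothetical):
       //   %vphi = phi [ %vec, %predicating.bb ], [ %vec.updated, %predicated.bb ]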
1863   unsigned Part = State.Instance->Part;
1864   if (State.hasVectorValue(getOperand(0), Part)) {
1865     Value *VectorValue = State.get(getOperand(0), Part);
1866     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1867     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1868     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1869     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1870     if (State.hasVectorValue(this, Part))
1871       State.reset(this, VPhi, Part);
1872     else
1873       State.set(this, VPhi, Part);
1874     // NOTE: Currently we need to update the value of the operand, so the next
1875     // predicated iteration inserts its generated value in the correct vector.
1876     State.reset(getOperand(0), VPhi, Part);
1877   } else {
1878     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1879     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1880     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1881                      PredicatingBB);
1882     Phi->addIncoming(ScalarPredInst, PredicatedBB);
1883     if (State.hasScalarValue(this, *State.Instance))
1884       State.reset(this, Phi, *State.Instance);
1885     else
1886       State.set(this, Phi, *State.Instance);
1887     // NOTE: Currently we need to update the value of the operand, so the next
1888     // predicated iteration inserts its generated value in the correct vector.
1889     State.reset(getOperand(0), Phi, *State.Instance);
1890   }
1891 }
1892 
1893 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1894 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1895                                 VPSlotTracker &SlotTracker) const {
1896   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1897   printAsOperand(O, SlotTracker);
1898   O << " = ";
1899   printOperands(O, SlotTracker);
1900 }
1901 
1902 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
1903                               VPSlotTracker &SlotTracker) const {
1904   O << Indent << "WIDEN ";
1905   printAsOperand(O, SlotTracker);
1906   O << " = load ";
1907   printOperands(O, SlotTracker);
1908 }
1909 
1910 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1911                                  VPSlotTracker &SlotTracker) const {
1912   O << Indent << "WIDEN ";
1913   printAsOperand(O, SlotTracker);
1914   O << " = vp.load ";
1915   printOperands(O, SlotTracker);
1916 }
1917 
1918 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
1919                                VPSlotTracker &SlotTracker) const {
1920   O << Indent << "WIDEN store ";
1921   printOperands(O, SlotTracker);
1922 }
1923 
1924 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1925                                   VPSlotTracker &SlotTracker) const {
1926   O << Indent << "WIDEN vp.store ";
1927   printOperands(O, SlotTracker);
1928 }
1929 #endif
1930 
1931 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1932   Value *Start = getStartValue()->getLiveInIRValue();
1933   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1934   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1935 
1936   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1937   EntryPart->addIncoming(Start, VectorPH);
1938   EntryPart->setDebugLoc(getDebugLoc());
1939   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1940     State.set(this, EntryPart, Part, /*IsScalar*/ true);
1941 }
1942 
1943 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1944 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1945                                    VPSlotTracker &SlotTracker) const {
1946   O << Indent << "EMIT ";
1947   printAsOperand(O, SlotTracker);
1948   O << " = CANONICAL-INDUCTION ";
1949   printOperands(O, SlotTracker);
1950 }
1951 #endif
1952 
1953 bool VPCanonicalIVPHIRecipe::isCanonical(
1954     InductionDescriptor::InductionKind Kind, VPValue *Start,
1955     VPValue *Step) const {
1956   // Must be an integer induction.
1957   if (Kind != InductionDescriptor::IK_IntInduction)
1958     return false;
1959   // Start must match the start value of this canonical induction.
1960   if (Start != getStartValue())
1961     return false;
1962 
1963   // If the step is defined by a recipe, it is not a ConstantInt.
1964   if (Step->getDefiningRecipe())
1965     return false;
1966 
1967   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1968   return StepC && StepC->isOne();
1969 }
1970 
1971 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
1972   return IsScalarAfterVectorization &&
1973          (!IsScalable || vputils::onlyFirstLaneUsed(this));
1974 }
1975 
1976 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1977 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1978                                           VPSlotTracker &SlotTracker) const {
1979   O << Indent << "EMIT ";
1980   printAsOperand(O, SlotTracker);
1981   O << " = WIDEN-POINTER-INDUCTION ";
1982   getStartValue()->printAsOperand(O, SlotTracker);
1983   O << ", " << *IndDesc.getStep();
1984 }
1985 #endif
1986 
1987 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1988   assert(!State.Instance && "cannot be used in per-lane");
1989   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
1990   SCEVExpander Exp(SE, DL, "induction");
1991 
1992   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1993                                  &*State.Builder.GetInsertPoint());
1994   assert(!State.ExpandedSCEVs.contains(Expr) &&
1995          "Same SCEV expanded multiple times");
1996   State.ExpandedSCEVs[Expr] = Res;
1997   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1998     State.set(this, Res, {Part, 0});
1999 }
2000 
2001 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2002 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2003                                VPSlotTracker &SlotTracker) const {
2004   O << Indent << "EMIT ";
2005   getVPSingleValue()->printAsOperand(O, SlotTracker);
2006   O << " = EXPAND SCEV " << *Expr;
2007 }
2008 #endif
2009 
2010 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2011   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
2012   Type *STy = CanonicalIV->getType();
2013   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2014   ElementCount VF = State.VF;
2015   Value *VStart = VF.isScalar()
2016                       ? CanonicalIV
2017                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
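       // Each part below computes splat(iv) + (Part * VF + <0,1,...,VF-1>);
       // e.g. for VF=4 (an illustrative fixed-width case) part 0 is
       // <iv, iv+1, iv+2, iv+3> and part 1 is <iv+4, iv+5, iv+6, iv+7>.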
2018   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2019     Value *VStep = createStepForVF(Builder, STy, VF, Part);
2020     if (VF.isVector()) {
2021       VStep = Builder.CreateVectorSplat(VF, VStep);
2022       VStep =
2023           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
2024     }
2025     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
2026     State.set(this, CanonicalVectorIV, Part);
2027   }
2028 }
2029 
2030 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2031 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
2032                                      VPSlotTracker &SlotTracker) const {
2033   O << Indent << "EMIT ";
2034   printAsOperand(O, SlotTracker);
2035   O << " = WIDEN-CANONICAL-INDUCTION ";
2036   printOperands(O, SlotTracker);
2037 }
2038 #endif
2039 
2040 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
2041   auto &Builder = State.Builder;
2042   // Create a vector from the initial value.
2043   auto *VectorInit = getStartValue()->getLiveInIRValue();
2044 
2045   Type *VecTy = State.VF.isScalar()
2046                     ? VectorInit->getType()
2047                     : VectorType::get(VectorInit->getType(), State.VF);
2048 
2049   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2050   if (State.VF.isVector()) {
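         // Insert the start value into the last lane of a poison vector, e.g.
         // <poison, poison, poison, %init> for VF=4 (an illustrative
         // fixed-width case; LastIdx is computed at runtime for scalable VFs).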
2051     auto *IdxTy = Builder.getInt32Ty();
2052     auto *One = ConstantInt::get(IdxTy, 1);
2053     IRBuilder<>::InsertPointGuard Guard(Builder);
2054     Builder.SetInsertPoint(VectorPH->getTerminator());
2055     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
2056     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
2057     VectorInit = Builder.CreateInsertElement(
2058         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
2059   }
2060 
2061   // Create a phi node for the new recurrence.
2062   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
2063   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2064   EntryPart->addIncoming(VectorInit, VectorPH);
2065   State.set(this, EntryPart, 0);
2066 }
2067 
2068 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2069 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
2070                                             VPSlotTracker &SlotTracker) const {
2071   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
2072   printAsOperand(O, SlotTracker);
2073   O << " = phi ";
2074   printOperands(O, SlotTracker);
2075 }
2076 #endif
2077 
2078 void VPReductionPHIRecipe::execute(VPTransformState &State) {
2079   auto &Builder = State.Builder;
2080 
2081   // Reductions do not have to start at zero. They can start with
2082   // any loop-invariant value.
2083   VPValue *StartVPV = getStartValue();
2084   Value *StartV = StartVPV->getLiveInIRValue();
2085 
2086   // In order to support recurrences we need to be able to vectorize Phi nodes.
2087   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
2088   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
2089   // this value when we vectorize all of the instructions that use the PHI.
2090   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
2091   Type *VecTy = ScalarPHI ? StartV->getType()
2092                           : VectorType::get(StartV->getType(), State.VF);
2093 
2094   BasicBlock *HeaderBB = State.CFG.PrevBB;
2095   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
2096          "recipe must be in the vector loop header");
2097   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
2098   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2099     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
2100     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
2101     State.set(this, EntryPart, Part, IsInLoop);
2102   }
2103 
2104   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2105 
2106   Value *Iden = nullptr;
2107   RecurKind RK = RdxDesc.getRecurrenceKind();
2108   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
2109       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
2110     // MinMax and AnyOf reductions have the start value as their identity.
2111     if (ScalarPHI) {
2112       Iden = StartV;
2113     } else {
2114       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2115       Builder.SetInsertPoint(VectorPH->getTerminator());
2116       StartV = Iden =
2117           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
2118     }
2119   } else {
2120     Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
2121                                          RdxDesc.getFastMathFlags());
2122 
2123     if (!ScalarPHI) {
2124       Iden = Builder.CreateVectorSplat(State.VF, Iden);
2125       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2126       Builder.SetInsertPoint(VectorPH->getTerminator());
2127       Constant *Zero = Builder.getInt32(0);
2128       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
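           // StartV now carries the start value in lane 0 and the identity in
           // the remaining lanes, e.g. <%start, 0, 0, 0> for an i32 add
           // reduction with VF=4 (an illustrative fixed-width case).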
2129     }
2130   }
2131 
2132   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2133     Value *EntryPart = State.get(this, Part, IsInLoop);
2134     // Make sure to add the reduction start value only to the
2135     // first unroll part.
2136     Value *StartVal = (Part == 0) ? StartV : Iden;
2137     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
2138   }
2139 }
2140 
2141 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2142 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2143                                  VPSlotTracker &SlotTracker) const {
2144   O << Indent << "WIDEN-REDUCTION-PHI ";
2145 
2146   printAsOperand(O, SlotTracker);
2147   O << " = phi ";
2148   printOperands(O, SlotTracker);
2149 }
2150 #endif
2151 
2152 void VPWidenPHIRecipe::execute(VPTransformState &State) {
2153   assert(EnableVPlanNativePath &&
2154          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
2155 
2156   Value *Op0 = State.get(getOperand(0), 0);
2157   Type *VecTy = Op0->getType();
2158   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
2159   State.set(this, VecPhi, 0);
2160 }
2161 
2162 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2163 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2164                              VPSlotTracker &SlotTracker) const {
2165   O << Indent << "WIDEN-PHI ";
2166 
2167   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
2168   // Unless all incoming values are modeled in VPlan, print the original PHI
2169   // directly.
2170   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
2171   // values as VPValues.
2172   if (getNumOperands() != OriginalPhi->getNumOperands()) {
2173     O << VPlanIngredient(OriginalPhi);
2174     return;
2175   }
2176 
2177   printAsOperand(O, SlotTracker);
2178   O << " = phi ";
2179   printOperands(O, SlotTracker);
2180 }
2181 #endif
2182 
2183 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2184 // remove VPActiveLaneMaskPHIRecipe.
2185 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
2186   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2187   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2188     Value *StartMask = State.get(getOperand(0), Part);
2189     PHINode *EntryPart =
2190         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
2191     EntryPart->addIncoming(StartMask, VectorPH);
2192     EntryPart->setDebugLoc(getDebugLoc());
2193     State.set(this, EntryPart, Part);
2194   }
2195 }
2196 
2197 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2198 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2199                                       VPSlotTracker &SlotTracker) const {
2200   O << Indent << "ACTIVE-LANE-MASK-PHI ";
2201 
2202   printAsOperand(O, SlotTracker);
2203   O << " = phi ";
2204   printOperands(O, SlotTracker);
2205 }
2206 #endif
2207 
2208 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
2209   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2210   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
2211   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
2212   PHINode *EntryPart =
2213       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
2214   EntryPart->addIncoming(Start, VectorPH);
2215   EntryPart->setDebugLoc(getDebugLoc());
2216   State.set(this, EntryPart, 0, /*IsScalar=*/true);
2217 }
2218 
2219 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2220 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2221                                   VPSlotTracker &SlotTracker) const {
2222   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
2223 
2224   printAsOperand(O, SlotTracker);
2225   O << " = phi ";
2226   printOperands(O, SlotTracker);
2227 }
2228 #endif
2229