//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPIteration(0, 0));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only computes the cost of recipes with an
  // underlying instruction. Removing the WidenMem case here would prevent
  // force-target-instruction-cost from overriding the cost of such recipes,
  // which would be inconsistent with the legacy model.
  // TODO: Remove WidenMem from this function when we don't need to compare to
  // the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe by falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // return 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing its cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0));
    Value *Op1 = State.get(getOperand(1));
    Value *Op2 = State.get(getOperand(2));
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(0, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

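    // A sketch of the intrinsic's semantics (per LangRef): lane i of the
    // <VF x i1> result is (VIVElem0 + i) < ScalarTC, with the addition
    // evaluated without wrapping, so the mask is an all-true prefix followed
    // by an all-false suffix.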
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
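    // Computes max(TripCount - VF * UF, 0), selected as TC > Step ? TC - Step
    // : 0 to avoid unsigned underflow. E.g. (illustrative) for TC = 10,
    // VF = 4, UF = 2 this yields 10 > 8 ? 10 - 8 : 0 == 2.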
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
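    // The llvm.experimental.get.vector.length intrinsic used below returns
    // the number of lanes to process in this step: no more than AVL (the
    // remaining iteration count) and no more than the runtime vector length;
    // the trailing i1 'true' marks VF as scalable.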
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop; vsetvli can
    // be outside of the main loop.
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
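    // E.g. (illustrative) for VF = vscale x 4 and Part = 1, Step materializes
    // as the runtime value vscale * 4, so this part's IV starts one full
    // vector's worth of lanes after the part-0 IV.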
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPIteration(0, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to the backward destination (the header) now and
    // to the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as its first
    // argument, so we pass the current block as a placeholder and reset the
    // successor to nullptr afterwards.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
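    // The result is computed in three steps below: combine the per-part
    // vectors into a single vector, reduce that vector to a scalar, and, if
    // the reduction was narrowed, sign-/zero-extend the scalar back to the
    // phi type.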
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    Type *PhiTy = OrigPhi->getType();
    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that in-loop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0),
                      VPIteration(0, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
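    // Merge the value produced by this VPlan (incoming via the parent block's
    // single VPlan predecessor) with the value used for all other,
    // not-yet-connected predecessors of the insert block.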
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
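  // Decide how this VPInstruction produces its result: a single scalar for
  // lane 0 only, one scalar per lane, or a single (per-part) vector value.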
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPIteration(0, Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPIteration(0, Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert(
      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
       State.VF.isScalar()) &&
      "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPIRInstruction::execute(VPTransformState &State) {
  assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
         "Only PHINodes can have extra operands");
  if (getNumOperands() == 1) {
    VPValue *ExitValue = getOperand(0);
    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPIteration(0, Lane));
    auto *Phi = cast<PHINode>(&I);
    Phi->addIncoming(V, PredBB);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;

  if (getNumOperands() != 0) {
    assert(getNumOperands() == 1 && "can have at most 1 operand");
    O << " (extra operand: ";
    printOperands(O, SlotTracker);
    O << ")";
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (UseIntrinsic &&
      isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
    TysForDecl.push_back(VectorType::get(
        CalledScalarFn->getReturnType()->getScalarType(), State.VF));
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(arg_operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (UseIntrinsic &&
        isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
      Arg = State.get(I.value(), VPIteration(0, 0));
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
      Arg = State.get(I.value(), VPIteration(0, 0));
    else
      Arg = State.get(I.value());
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
      TysForDecl.push_back(Arg->getType());
    Args.push_back(Arg);
  }

  Function *VectorF;
  if (UseIntrinsic) {
    // Use vector version of the intrinsic.
    Module *M = State.Builder.GetInsertBlock()->getModule();
    VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
    assert(VectorF && "Can't retrieve vector intrinsic.");
  } else {
    assert(Variant != nullptr && "Can't create vector function.");
    VectorF = Variant;
  }

  auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

  if (isa<FPMathOperator>(V))
    V->copyFastMathFlags(CI);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (Variant) {
    return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                    Variant->getFunctionType()->params(),
                                    CostKind);
  }

  FastMathFlags FMF;
  // TODO: Manage flags via VPRecipeWithIRFlags.
  if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
    FMF = FPMO->getFastMathFlags();

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  Value *Cond = InvarCond ? InvarCond : State.get(getCond());
  Value *Op0 = State.get(getOperand(1));
  Value *Op1 = State.get(getOperand(2));
  Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
  State.set(this, Sel);
  State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(State.get(VPOp));

    Value *V = Builder.CreateNAryOp(Opcode, Ops);

    if (auto *VecOp = dyn_cast<Instruction>(V))
      setFlags(VecOp);

    // Use this vector value for all users of the original instruction.
    State.set(this, V);
    State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));

    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
      if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
        Builder.setFastMathFlags(I->getFastMathFlags());
      C = Builder.CreateFCmp(getPredicate(), A, B);
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    State.set(this, C);
    State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
                      << Instruction::getOpcodeName(Opcode) << "\n");
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      CostKind, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

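  // A sketch of what is emitted below (illustrative): each operation becomes
  // the corresponding llvm.vp.* intrinsic, called with an all-true mask and
  // the explicit vector length appended, e.g. an fadd becomes
  // llvm.vp.fadd(%a, %b, <all-true mask>, i32 %evl); only the first EVL
  // lanes produce defined results.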
  State.setDebugLocFrom(getDebugLoc());

  assert(State.get(getOperand(0))->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp));
  }

  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept FMF flags.
  // TODO: Enable other flags when support is added.
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp." << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p Opcode is relevant for FP induction variable.
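/// For example (illustrative): with Val = <2.0, 2.0>, StartIdx = 0,
/// Step = 0.5 and BinOp = FAdd, this returns <2.0, 2.5>.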
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx.
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

1411 /// A helper function that returns an integer or floating-point constant with
1412 /// value C.
1413 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1414   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1415                            : ConstantFP::get(Ty, C);
1416 }
1417 
1418 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1419   assert(!State.Instance && "Int or FP induction being replicated.");
1420 
1421   Value *Start = getStartValue()->getLiveInIRValue();
1422   const InductionDescriptor &ID = getInductionDescriptor();
1423   TruncInst *Trunc = getTruncInst();
1424   IRBuilderBase &Builder = State.Builder;
1425   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1426   assert(State.VF.isVector() && "must have vector VF");
1427 
1428   // The value from the original loop to which we are mapping the new induction
1429   // variable.
1430   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1431 
1432   // Fast-math-flags propagate from the original induction instruction.
1433   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1434   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1435     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1436 
1437   // Now do the actual transformations, starting with fetching the step value.
1438   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1439 
1440   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1441          "Expected either an induction phi-node or a truncate of it!");
1442 
1443   // Construct the initial value of the vector IV in the vector loop preheader.
1444   auto CurrIP = Builder.saveIP();
1445   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1446   Builder.SetInsertPoint(VectorPH->getTerminator());
1447   if (isa<TruncInst>(EntryVal)) {
1448     assert(Start->getType()->isIntegerTy() &&
1449            "Truncation requires an integer type");
1450     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1451     Step = Builder.CreateTrunc(Step, TruncType);
1452     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1453   }
1454 
1455   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1456   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1457   Value *SteppedStart = getStepVector(
1458       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
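       // For example, with Start = 5, Step = 2 and VF = 4, SteppedStart is
       // <5, 7, 9, 11>.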
1459 
1460   // We create vector phi nodes for both integer and floating-point induction
1461   // variables. Here, we determine the kind of arithmetic we will perform.
1462   Instruction::BinaryOps AddOp;
1463   Instruction::BinaryOps MulOp;
1464   if (Step->getType()->isIntegerTy()) {
1465     AddOp = Instruction::Add;
1466     MulOp = Instruction::Mul;
1467   } else {
1468     AddOp = ID.getInductionOpcode();
1469     MulOp = Instruction::FMul;
1470   }
1471 
1472   Value *SplatVF;
1473   if (VPValue *SplatVFOperand = getSplatVFValue()) {
1474     // The recipe has been unrolled. In that case, fetch the splat value for the
1475     // induction increment.
1476     SplatVF = State.get(SplatVFOperand);
1477   } else {
1478     // Multiply the vectorization factor by the step using integer or
1479     // floating-point arithmetic as appropriate.
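         // For example, with VF = 4 and Step = 2 this yields SplatVF =
         // <8, 8, 8, 8>, which is added to the vector IV on each iteration.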
1480     Type *StepType = Step->getType();
1481     Value *RuntimeVF = State.get(getVFValue(), {0, 0});
1482     if (Step->getType()->isFloatingPointTy())
1483       RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1484     else
1485       RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1486     Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1487 
1488     // Create a vector splat to use in the induction update.
1489     //
1490     // FIXME: If the step is non-constant, we create the vector splat with
1491     //        IRBuilder. IRBuilder can constant-fold the multiply, but it
1492     //        doesn't handle a constant vector splat.
1493     SplatVF = isa<Constant>(Mul)
1494                   ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1495                   : Builder.CreateVectorSplat(State.VF, Mul);
1496   }
1497 
1498   Builder.restoreIP(CurrIP);
1499 
1500   // We may need to add the step a number of times, depending on the unroll
1501   // factor. The last of those goes into the PHI.
1502   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1503   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1504   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1505   State.set(this, VecInd);
1506 
1507   Instruction *LastInduction = cast<Instruction>(
1508       Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
1509   if (isa<TruncInst>(EntryVal))
1510     State.addMetadata(LastInduction, EntryVal);
1511   LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1512 
1513   VecInd->addIncoming(SteppedStart, VectorPH);
1514   // Add induction update using an incorrect block temporarily. The phi node
1515   // will be fixed after VPlan execution. Note that at this point the latch
1516   // block cannot be used, as it does not exist yet.
1517   // TODO: Model increment value in VPlan, by turning the recipe into a
1518   // multi-def and a subclass of VPHeaderPHIRecipe.
1519   VecInd->addIncoming(LastInduction, VectorPH);
1520 }
1521 
1522 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1523 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1524                                           VPSlotTracker &SlotTracker) const {
1525   O << Indent << "WIDEN-INDUCTION";
1526   if (getTruncInst()) {
1527     O << "\\l\"";
1528     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1529     O << " +\n" << Indent << "\"  ";
1530     getVPValue(0)->printAsOperand(O, SlotTracker);
1531   } else
1532     O << " " << VPlanIngredient(IV);
1533 
1534   O << ", ";
1535   getStepValue()->printAsOperand(O, SlotTracker);
1536 
1537   O << ", ";
1538   getVFValue()->printAsOperand(O, SlotTracker);
1539 }
1540 #endif
1541 
1542 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1543   // The step may be defined by a recipe in the preheader (e.g. if it requires
1544   // SCEV expansion), but for the canonical induction the step is required to be
1545   // 1, which is represented as a live-in.
1546   if (getStepValue()->getDefiningRecipe())
1547     return false;
1548   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1549   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1550   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1551   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1552          getScalarType() == CanIV->getScalarType();
1553 }
1554 
1555 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1556 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1557                               VPSlotTracker &SlotTracker) const {
1558   O << Indent;
1559   printAsOperand(O, SlotTracker);
1560   O << " = DERIVED-IV ";
1561   getStartValue()->printAsOperand(O, SlotTracker);
1562   O << " + ";
1563   getOperand(1)->printAsOperand(O, SlotTracker);
1564   O << " * ";
1565   getStepValue()->printAsOperand(O, SlotTracker);
1566 }
1567 #endif
1568 
1569 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1570   // Fast-math-flags propagate from the original induction instruction.
1571   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1572   if (hasFastMathFlags())
1573     State.Builder.setFastMathFlags(getFastMathFlags());
1574 
1575   // Compute scalar induction steps. BaseIV is the scalar induction variable on
1576   // which to base the steps; Step is the size of the step.
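       // For an integer induction, each lane's value is
       // BaseIV + (StartIdx0 + Lane) * Step, where StartIdx0 is VF * UnrollPart
       // (computed below); e.g. with VF = 4, unroll part 1 and Step = 2, the
       // lanes get BaseIV + 8, BaseIV + 10, BaseIV + 12 and BaseIV + 14.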
1577 
1578   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1579   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1580   IRBuilderBase &Builder = State.Builder;
1581 
1582   // Ensure step has the same type as that of scalar IV.
1583   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1584   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1585 
1586   // We build scalar steps for both integer and floating-point induction
1587   // variables. Here, we determine the kind of arithmetic we will perform.
1588   Instruction::BinaryOps AddOp;
1589   Instruction::BinaryOps MulOp;
1590   if (BaseIVTy->isIntegerTy()) {
1591     AddOp = Instruction::Add;
1592     MulOp = Instruction::Mul;
1593   } else {
1594     AddOp = InductionOpcode;
1595     MulOp = Instruction::FMul;
1596   }
1597 
1598   // Determine the number of scalars we need to generate for each unroll
1599   // iteration.
1600   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1601   // Compute the scalar steps and save the results in State.
1602   Type *IntStepTy =
1603       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1604   Type *VecIVTy = nullptr;
1605   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1606   if (!FirstLaneOnly && State.VF.isScalable()) {
1607     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1608     UnitStepVec =
1609         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1610     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1611     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1612   }
1613 
1614   unsigned StartLane = 0;
1615   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1616   if (State.Instance) {
1617     StartLane = State.Instance->Lane.getKnownLane();
1618     EndLane = StartLane + 1;
1619   }
1620   Value *StartIdx0 =
1621       createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
1622 
1623   if (!FirstLaneOnly && State.VF.isScalable()) {
1624     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1625     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1626     if (BaseIVTy->isFloatingPointTy())
1627       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1628     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1629     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1630     State.set(this, Add);
1631     // It's also useful to record the individual lane values for the known
1632     // minimum number of elements, so we do that below. This improves code
1633     // quality when, for example, extracting the first element.
1634   }
1635 
1636   if (BaseIVTy->isFloatingPointTy())
1637     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1638 
1639   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1640     Value *StartIdx = Builder.CreateBinOp(
1641         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1642     // The step returned by `createStepForVF` is a runtime-evaluated value
1643     // when VF is scalable. Otherwise, it should be folded into a Constant.
1644     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1645            "Expected StartIdx to be folded to a constant when VF is not "
1646            "scalable");
1647     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1648     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1649     State.set(this, Add, VPIteration(0, Lane));
1650   }
1651 }
1652 
1653 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1654 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1655                                   VPSlotTracker &SlotTracker) const {
1656   O << Indent;
1657   printAsOperand(O, SlotTracker);
1658   O << " = SCALAR-STEPS ";
1659   printOperands(O, SlotTracker);
1660 }
1661 #endif
1662 
1663 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1664   assert(State.VF.isVector() && "not widening");
1665   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1666   // Construct a vector GEP by widening the operands of the scalar GEP as
1667   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1668   // results in a vector of pointers when at least one operand of the GEP
1669   // is vector-typed. Thus, to keep the representation compact, we only use
1670   // vector-typed operands for loop-varying values.
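       //
       // For example, a scalar GEP with a loop-varying index, such as
       //   %gep = getelementptr i32, ptr %base, i64 %iv
       // is widened (with VF = 4) to
       //   %gep = getelementptr i32, ptr %base, <4 x i64> %vec.iv
       // which yields a <4 x ptr> result.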
1671 
1672   if (areAllOperandsInvariant()) {
1673     // If we are vectorizing, but the GEP has only loop-invariant operands,
1674     // the GEP we build (by only using vector-typed operands for
1675     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1676     // produce a vector of pointers, we need to either arbitrarily pick an
1677     // operand to broadcast, or broadcast a clone of the original GEP.
1678     // Here, we broadcast a clone of the original.
1679     //
1680     // TODO: If at some point we decide to scalarize instructions having
1681     //       loop-invariant operands, this special case will no longer be
1682     //       required. We would add the scalarization decision to
1683     //       collectLoopScalars() and teach getVectorValue() to broadcast
1684     //       the lane-zero scalar value.
1685     SmallVector<Value *> Ops;
1686     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1687       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1688 
1689     auto *NewGEP =
1690         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1691                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1692     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1693     State.set(this, Splat);
1694     State.addMetadata(Splat, GEP);
1695   } else {
1696     // If the GEP has at least one loop-varying operand, we are sure to
1697     // produce a vector of pointers unless VF is scalar.
1698     // The pointer operand of the new GEP. If it's loop-invariant, we
1699     // won't broadcast it.
1700     auto *Ptr = isPointerLoopInvariant()
1701                     ? State.get(getOperand(0), VPIteration(0, 0))
1702                     : State.get(getOperand(0));
1703 
1704     // Collect all the indices for the new GEP. If any index is
1705     // loop-invariant, we won't broadcast it.
1706     SmallVector<Value *, 4> Indices;
1707     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1708       VPValue *Operand = getOperand(I);
1709       if (isIndexLoopInvariant(I - 1))
1710         Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1711       else
1712         Indices.push_back(State.get(Operand));
1713     }
1714 
1715     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1716     // but it should be a vector otherwise.
1717     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1718                                            Indices, "", isInBounds());
1719     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1720            "NewGEP is not a pointer vector");
1721     State.set(this, NewGEP);
1722     State.addMetadata(NewGEP, GEP);
1723   }
1724 }
1725 
1726 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1727 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1728                              VPSlotTracker &SlotTracker) const {
1729   O << Indent << "WIDEN-GEP ";
1730   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1731   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1732     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1733 
1734   O << " ";
1735   printAsOperand(O, SlotTracker);
1736   O << " = getelementptr";
1737   printFlags(O);
1738   printOperands(O, SlotTracker);
1739 }
1740 #endif
1741 
1742 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1743   auto &Builder = State.Builder;
1744   State.setDebugLocFrom(getDebugLoc());
1745   unsigned CurrentPart = getUnrollPart(*this);
1746   // Use i32 for the gep index type when the value is constant,
1747   // or query DataLayout for a more suitable index type otherwise.
1748   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
1749   Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0)
1750                       ? DL.getIndexType(IndexedTy->getPointerTo())
1751                       : Builder.getInt32Ty();
1752   Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1753   bool InBounds = isInBounds();
1754 
1755   Value *ResultPtr = nullptr;
1756   if (IsReverse) {
1757     // If the address is consecutive but reversed, then the
1758     // wide load or store needs to start at the last vector element.
1759     // RunTimeVF = VScale * VF.getKnownMinValue()
1760     // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue().
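         // For example, with a fixed VF = 4: part 0 computes NumElt = 0 and
         // LastLane = -3, addressing Ptr[-3..0]; part 1 computes NumElt = -4 and
         // LastLane = -3, addressing Ptr[-7..-4]; and so on.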
1761     Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1762     // NumElt = -CurrentPart * RunTimeVF
1763     Value *NumElt = Builder.CreateMul(
1764         ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
1765     // LastLane = 1 - RunTimeVF
1766     Value *LastLane =
1767         Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1768     ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1769     ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds);
1770   } else {
1771     Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
1772     ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1773   }
1774 
1775   State.set(this, ResultPtr, /*IsScalar*/ true);
1776 }
1777 
1778 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1779 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1780                                   VPSlotTracker &SlotTracker) const {
1781   O << Indent;
1782   printAsOperand(O, SlotTracker);
1783   O << " = vector-pointer ";
1784   if (IsReverse)
1785     O << "(reverse) ";
1786 
1787   printOperands(O, SlotTracker);
1788 }
1789 #endif
1790 
1791 void VPBlendRecipe::execute(VPTransformState &State) {
1792   assert(isNormalized() && "Expected blend to be normalized!");
1793   State.setDebugLocFrom(getDebugLoc());
1794   // We know that all PHIs in non-header blocks are converted into
1795   // selects, so we don't have to worry about the insertion order and we
1796   // can just use the builder.
1797   // At this point we generate the predication tree: a chain of selects.
1798   // There may be redundant selects, since this is a simple linear scan, but
1799   // later optimizations will clean them up.
1800 
1801   unsigned NumIncoming = getNumIncomingValues();
1802 
1803   // Generate a sequence of selects of the form:
1804   // SELECT(Mask3, In3,
1805   //        SELECT(Mask2, In2,
1806   //               SELECT(Mask1, In1,
1807   //                      In0)))
1808   // Note that Mask0 is never used: lanes for which no path reaches this phi
1809   // are essentially undef and take their value from In0.
1810   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1811   Value *Result = nullptr;
1812   for (unsigned In = 0; In < NumIncoming; ++In) {
1813     // We might have single-edge PHIs (blocks); no select is needed for the
1814     // first PHI operand.
1815     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
1816     if (In == 0)
1817       Result = In0; // Initialize with the first incoming value.
1818     else {
1819       // Select between the current value and the previous incoming edge
1820       // based on the incoming mask.
1821       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
1822       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
1823     }
1824   }
1825   State.set(this, Result, OnlyFirstLaneUsed);
1826 }
1827 
1828 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1829 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1830                           VPSlotTracker &SlotTracker) const {
1831   O << Indent << "BLEND ";
1832   printAsOperand(O, SlotTracker);
1833   O << " =";
1834   if (getNumIncomingValues() == 1) {
1835     // Not a User of any mask: not really blending, this is a
1836     // single-predecessor phi.
1837     O << " ";
1838     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1839   } else {
1840     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1841       O << " ";
1842       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1843       if (I == 0)
1844         continue;
1845       O << "/";
1846       getMask(I)->printAsOperand(O, SlotTracker);
1847     }
1848   }
1849 }
1850 #endif
1851 
1852 void VPReductionRecipe::execute(VPTransformState &State) {
1853   assert(!State.Instance && "Reduction being replicated.");
1854   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
1855   RecurKind Kind = RdxDesc.getRecurrenceKind();
1856   // Propagate the fast-math flags carried by the underlying instruction.
1857   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1858   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1859   Value *NewVecOp = State.get(getVecOp());
1860   if (VPValue *Cond = getCondOp()) {
1861     Value *NewCond = State.get(Cond, State.VF.isScalar());
1862     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1863     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1864 
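         // Lanes with a false condition must not change the reduction result, so
         // they are replaced by the start value (for any-of reductions) or by the
         // identity of the reduction kind (e.g. 0 for an integer add).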
1865     Value *Start;
1866     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
1867       Start = RdxDesc.getRecurrenceStartValue();
1868     else
1869       Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
1870                                           RdxDesc.getFastMathFlags());
1871     if (State.VF.isVector())
1872       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
1873 
1874     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
1875     NewVecOp = Select;
1876   }
1877   Value *NewRed;
1878   Value *NextInChain;
1879   if (IsOrdered) {
1880     if (State.VF.isVector())
1881       NewRed =
1882           createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
1883     else
1884       NewRed = State.Builder.CreateBinOp(
1885           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1886           NewVecOp);
1887     PrevInChain = NewRed;
1888     NextInChain = NewRed;
1889   } else {
1890     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
1891     NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
1892     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1893       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1894                                    NewRed, PrevInChain);
1895     else
1896       NextInChain = State.Builder.CreateBinOp(
1897           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1898   }
1899   State.set(this, NextInChain, /*IsScalar*/ true);
1900 }
1901 
1902 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1903   assert(!State.Instance && "Reduction being replicated.");
1904 
1905   auto &Builder = State.Builder;
1906   // Propagate the fast-math flags carried by the underlying instruction.
1907   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1908   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1909   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1910 
1911   RecurKind Kind = RdxDesc.getRecurrenceKind();
1912   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
1913   Value *VecOp = State.get(getVecOp());
1914   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
1915 
1916   VectorBuilder VBuilder(Builder);
1917   VBuilder.setEVL(EVL);
1918   Value *Mask;
1919   // TODO: move the all-true mask generation into VectorBuilder.
1920   if (VPValue *CondOp = getCondOp())
1921     Mask = State.get(CondOp);
1922   else
1923     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
1924   VBuilder.setMask(Mask);
1925 
1926   Value *NewRed;
1927   if (isOrdered()) {
1928     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
1929   } else {
1930     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
1931     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1932       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
1933     else
1934       NewRed = Builder.CreateBinOp(
1935           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
1936   }
1937   State.set(this, NewRed, /*IsScalar*/ true);
1938 }
1939 
1940 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1941 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1942                               VPSlotTracker &SlotTracker) const {
1943   O << Indent << "REDUCE ";
1944   printAsOperand(O, SlotTracker);
1945   O << " = ";
1946   getChainOp()->printAsOperand(O, SlotTracker);
1947   O << " +";
1948   if (isa<FPMathOperator>(getUnderlyingInstr()))
1949     O << getUnderlyingInstr()->getFastMathFlags();
1950   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1951   getVecOp()->printAsOperand(O, SlotTracker);
1952   if (isConditional()) {
1953     O << ", ";
1954     getCondOp()->printAsOperand(O, SlotTracker);
1955   }
1956   O << ")";
1957   if (RdxDesc.IntermediateStore)
1958     O << " (with final reduction value stored in invariant address sank "
1959          "outside of loop)";
1960 }
1961 
1962 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1963                                  VPSlotTracker &SlotTracker) const {
1964   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1965   O << Indent << "REDUCE ";
1966   printAsOperand(O, SlotTracker);
1967   O << " = ";
1968   getChainOp()->printAsOperand(O, SlotTracker);
1969   O << " +";
1970   if (isa<FPMathOperator>(getUnderlyingInstr()))
1971     O << getUnderlyingInstr()->getFastMathFlags();
1972   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1973   getVecOp()->printAsOperand(O, SlotTracker);
1974   O << ", ";
1975   getEVL()->printAsOperand(O, SlotTracker);
1976   if (isConditional()) {
1977     O << ", ";
1978     getCondOp()->printAsOperand(O, SlotTracker);
1979   }
1980   O << ")";
1981   if (RdxDesc.IntermediateStore)
1982     O << " (with final reduction value stored in invariant address sank "
1983          "outside of loop)";
1984 }
1985 #endif
1986 
1987 bool VPReplicateRecipe::shouldPack() const {
1988   // Find if the recipe is used by a widened recipe via an intervening
1989   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1990   return any_of(users(), [](const VPUser *U) {
1991     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1992       return any_of(PredR->users(), [PredR](const VPUser *U) {
1993         return !U->usesScalars(PredR);
1994       });
1995     return false;
1996   });
1997 }
1998 
1999 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2000 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2001                               VPSlotTracker &SlotTracker) const {
2002   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2003 
2004   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2005     printAsOperand(O, SlotTracker);
2006     O << " = ";
2007   }
2008   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2009     O << "call";
2010     printFlags(O);
2011     O << "@" << CB->getCalledFunction()->getName() << "(";
2012     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2013                     O, [&O, &SlotTracker](VPValue *Op) {
2014                       Op->printAsOperand(O, SlotTracker);
2015                     });
2016     O << ")";
2017   } else {
2018     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2019     printFlags(O);
2020     printOperands(O, SlotTracker);
2021   }
2022 
2023   if (shouldPack())
2024     O << " (S->V)";
2025 }
2026 #endif
2027 
2028 Value *VPScalarCastRecipe::generate(VPTransformState &State) {
2029   assert(vputils::onlyFirstLaneUsed(this) &&
2030          "Codegen only implemented for first lane.");
2031   switch (Opcode) {
2032   case Instruction::SExt:
2033   case Instruction::ZExt:
2034   case Instruction::Trunc: {
2035     // Note: SExt/ZExt not used yet.
2036     Value *Op = State.get(getOperand(0), VPIteration(0, 0));
2037     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2038   }
2039   default:
2040     llvm_unreachable("opcode not implemented yet");
2041   }
2042 }
2043 
2044 void VPScalarCastRecipe::execute(VPTransformState &State) {
2045   State.set(this, generate(State), VPIteration(0, 0));
2046 }
2047 
2048 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2049 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2050                                VPSlotTracker &SlotTracker) const {
2051   O << Indent << "SCALAR-CAST ";
2052   printAsOperand(O, SlotTracker);
2053   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2054   printOperands(O, SlotTracker);
2055   O << " to " << *ResultTy;
2056 }
2057 #endif
2058 
2059 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2060   assert(State.Instance && "Branch on Mask works only on single instance.");
2061 
2062   unsigned Lane = State.Instance->Lane.getKnownLane();
2063 
2064   Value *ConditionBit = nullptr;
2065   VPValue *BlockInMask = getMask();
2066   if (BlockInMask) {
2067     ConditionBit = State.get(BlockInMask);
2068     if (ConditionBit->getType()->isVectorTy())
2069       ConditionBit = State.Builder.CreateExtractElement(
2070           ConditionBit, State.Builder.getInt32(Lane));
2071   } else // Block in mask is all-one.
2072     ConditionBit = State.Builder.getTrue();
2073 
2074   // Replace the temporary unreachable terminator with a new conditional branch,
2075   // whose two destinations will be set later when they are created.
2076   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2077   assert(isa<UnreachableInst>(CurrentTerminator) &&
2078          "Expected to replace unreachable terminator with conditional branch.");
2079   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2080   CondBr->setSuccessor(0, nullptr);
2081   ReplaceInstWithInst(CurrentTerminator, CondBr);
2082 }
2083 
2084 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2085   assert(State.Instance && "Predicated instruction PHI works per instance.");
2086   Instruction *ScalarPredInst =
2087       cast<Instruction>(State.get(getOperand(0), *State.Instance));
2088   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2089   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2090   assert(PredicatingBB && "Predicated block has no single predecessor.");
2091   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2092          "operand must be VPReplicateRecipe");
2093 
2094   // By current pack/unpack logic we need to generate only a single phi node: if
2095   // a vector value for the predicated instruction exists at this point it means
2096   // the instruction has vector users only, and a phi for the vector value is
2097   // needed. In this case the recipe of the predicated instruction is marked to
2098   // also do that packing, thereby "hoisting" the insert-element sequence.
2099   // Otherwise, a phi node for the scalar value is needed.
2100   if (State.hasVectorValue(getOperand(0))) {
2101     Value *VectorValue = State.get(getOperand(0));
2102     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2103     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2104     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2105     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2106     if (State.hasVectorValue(this))
2107       State.reset(this, VPhi);
2108     else
2109       State.set(this, VPhi);
2110     // NOTE: Currently we need to update the value of the operand, so the next
2111     // predicated iteration inserts its generated value in the correct vector.
2112     State.reset(getOperand(0), VPhi);
2113   } else {
2114     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2115     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2116     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2117                      PredicatingBB);
2118     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2119     if (State.hasScalarValue(this, *State.Instance))
2120       State.reset(this, Phi, *State.Instance);
2121     else
2122       State.set(this, Phi, *State.Instance);
2123     // NOTE: Currently we need to update the value of the operand, so the next
2124     // predicated iteration inserts its generated value in the correct vector.
2125     State.reset(getOperand(0), Phi, *State.Instance);
2126   }
2127 }
2128 
2129 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2130 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2131                                 VPSlotTracker &SlotTracker) const {
2132   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2133   printAsOperand(O, SlotTracker);
2134   O << " = ";
2135   printOperands(O, SlotTracker);
2136 }
2137 #endif
2138 
2139 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2140                                                  VPCostContext &Ctx) const {
2141   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2142   const Align Alignment =
2143       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2144   unsigned AS =
2145       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2146   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2147 
2148   if (!Consecutive) {
2149     // TODO: Using the original IR may not be accurate.
2150     // Currently, ARM will use the underlying IR to calculate gather/scatter
2151     // instruction cost.
2152     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2153     assert(!Reverse &&
2154            "Inconsecutive memory access should not have the order.");
2155     return Ctx.TTI.getAddressComputationCost(Ty) +
2156            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2157                                           IsMasked, Alignment, CostKind,
2158                                           &Ingredient);
2159   }
2160 
2161   InstructionCost Cost = 0;
2162   if (IsMasked) {
2163     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2164                                           AS, CostKind);
2165   } else {
2166     TTI::OperandValueInfo OpInfo =
2167         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2168     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2169                                     CostKind, OpInfo, &Ingredient);
2170   }
2171   if (!Reverse)
2172     return Cost;
2173 
2174   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2175                                        cast<VectorType>(Ty), {}, CostKind, 0);
2176 }
2177 
2178 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2179   auto *LI = cast<LoadInst>(&Ingredient);
2180 
2181   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2182   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2183   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2184   bool CreateGather = !isConsecutive();
2185 
2186   auto &Builder = State.Builder;
2187   State.setDebugLocFrom(getDebugLoc());
2188   Value *Mask = nullptr;
2189   if (auto *VPMask = getMask()) {
2190     // Mask reversal is only needed for non-all-one masks; an all-one mask is
2191     // represented as a null mask, and the reverse of a null mask is itself null.
2192     Mask = State.get(VPMask);
2193     if (isReverse())
2194       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2195   }
2196 
2197   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
2198   Value *NewLI;
2199   if (CreateGather) {
2200     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2201                                        "wide.masked.gather");
2202   } else if (Mask) {
2203     NewLI =
2204         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2205                                  PoisonValue::get(DataTy), "wide.masked.load");
2206   } else {
2207     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2208   }
2209   // Add metadata to the load, but set the vector value to the reverse shuffle.
2210   State.addMetadata(NewLI, LI);
2211   if (Reverse)
2212     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2213   State.set(this, NewLI);
2214 }
2215 
2216 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2217 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2218                               VPSlotTracker &SlotTracker) const {
2219   O << Indent << "WIDEN ";
2220   printAsOperand(O, SlotTracker);
2221   O << " = load ";
2222   printOperands(O, SlotTracker);
2223 }
2224 #endif
2225 
2226 /// Use an all-true mask for the reverse rather than the actual mask, as doing
2227 /// so avoids a dependence without affecting the result.
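     ///
     /// For example, with EVL = 3, vp.reverse turns <a0, a1, a2, a3> into
     /// <a2, a1, a0, poison>; lanes at or past EVL are unspecified.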
2228 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2229                                      Value *EVL, const Twine &Name) {
2230   VectorType *ValTy = cast<VectorType>(Operand->getType());
2231   Value *AllTrueMask =
2232       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2233   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2234                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2235 }
2236 
2237 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2238   auto *LI = cast<LoadInst>(&Ingredient);
2239 
2240   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2241   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2242   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2243   bool CreateGather = !isConsecutive();
2244 
2245   auto &Builder = State.Builder;
2246   State.setDebugLocFrom(getDebugLoc());
2247   CallInst *NewLI;
2248   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2249   Value *Addr = State.get(getAddr(), !CreateGather);
2250   Value *Mask = nullptr;
2251   if (VPValue *VPMask = getMask()) {
2252     Mask = State.get(VPMask);
2253     if (isReverse())
2254       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2255   } else {
2256     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2257   }
2258 
2259   if (CreateGather) {
2260     NewLI =
2261         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2262                                 nullptr, "wide.masked.gather");
2263   } else {
2264     VectorBuilder VBuilder(Builder);
2265     VBuilder.setEVL(EVL).setMask(Mask);
2266     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2267         Instruction::Load, DataTy, Addr, "vp.op.load"));
2268   }
2269   NewLI->addParamAttr(
2270       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2271   State.addMetadata(NewLI, LI);
2272   Instruction *Res = NewLI;
2273   if (isReverse())
2274     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2275   State.set(this, Res);
2276 }
2277 
2278 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2279 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2280                                  VPSlotTracker &SlotTracker) const {
2281   O << Indent << "WIDEN ";
2282   printAsOperand(O, SlotTracker);
2283   O << " = vp.load ";
2284   printOperands(O, SlotTracker);
2285 }
2286 #endif
2287 
2288 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2289   auto *SI = cast<StoreInst>(&Ingredient);
2290 
2291   VPValue *StoredVPValue = getStoredValue();
2292   bool CreateScatter = !isConsecutive();
2293   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2294 
2295   auto &Builder = State.Builder;
2296   State.setDebugLocFrom(getDebugLoc());
2297 
2298   Value *Mask = nullptr;
2299   if (auto *VPMask = getMask()) {
2300     // Mask reversal is only needed for non-all-one masks; an all-one mask is
2301     // represented as a null mask, and the reverse of a null mask is itself null.
2302     Mask = State.get(VPMask);
2303     if (isReverse())
2304       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2305   }
2306 
2307   Value *StoredVal = State.get(StoredVPValue);
2308   if (isReverse()) {
2309     // If we store to reverse consecutive memory locations, then we need
2310     // to reverse the order of elements in the stored value.
2311     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2312     // We don't want to update the value in the map as it might be used in
2313     // another expression. So don't call resetVectorValue(StoredVal).
2314   }
2315   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2316   Instruction *NewSI = nullptr;
2317   if (CreateScatter)
2318     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2319   else if (Mask)
2320     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2321   else
2322     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2323   State.addMetadata(NewSI, SI);
2324 }
2325 
2326 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2327 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2328                                VPSlotTracker &SlotTracker) const {
2329   O << Indent << "WIDEN store ";
2330   printOperands(O, SlotTracker);
2331 }
2332 #endif
2333 
2334 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2335   auto *SI = cast<StoreInst>(&Ingredient);
2336 
2337   VPValue *StoredValue = getStoredValue();
2338   bool CreateScatter = !isConsecutive();
2339   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2340 
2341   auto &Builder = State.Builder;
2342   State.setDebugLocFrom(getDebugLoc());
2343 
2344   CallInst *NewSI = nullptr;
2345   Value *StoredVal = State.get(StoredValue);
2346   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2347   if (isReverse())
2348     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2349   Value *Mask = nullptr;
2350   if (VPValue *VPMask = getMask()) {
2351     Mask = State.get(VPMask);
2352     if (isReverse())
2353       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2354   } else {
2355     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2356   }
2357   Value *Addr = State.get(getAddr(), !CreateScatter);
2358   if (CreateScatter) {
2359     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2360                                     Intrinsic::vp_scatter,
2361                                     {StoredVal, Addr, Mask, EVL});
2362   } else {
2363     VectorBuilder VBuilder(Builder);
2364     VBuilder.setEVL(EVL).setMask(Mask);
2365     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2366         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2367         {StoredVal, Addr}));
2368   }
2369   NewSI->addParamAttr(
2370       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2371   State.addMetadata(NewSI, SI);
2372 }
2373 
2374 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2375 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2376                                   VPSlotTracker &SlotTracker) const {
2377   O << Indent << "WIDEN vp.store ";
2378   printOperands(O, SlotTracker);
2379 }
2380 #endif
2381 
2382 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2383                                      VectorType *DstVTy, const DataLayout &DL) {
2384   // Verify that V is a vector with the same number of elements as DstVTy.
2385   auto VF = DstVTy->getElementCount();
2386   auto *SrcVecTy = cast<VectorType>(V->getType());
2387   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2388   Type *SrcElemTy = SrcVecTy->getElementType();
2389   Type *DstElemTy = DstVTy->getElementType();
2390   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2391          "Vector elements must have same size");
2392 
2393   // Do a direct cast if element types are castable.
2394   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2395     return Builder.CreateBitOrPointerCast(V, DstVTy);
2396   }
2397   // V cannot be cast directly to the desired vector type. This may happen
2398   // when V is a floating-point vector but DstVTy is a vector of pointers, or
2399   // vice-versa. Handle this with a two-step bitcast through an intermediate
2400   // integer type, i.e. Ptr <-> Int <-> Float.
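       //
       // For example, assuming 64-bit pointers, <4 x double> is converted to
       // <4 x ptr> by first bitcasting to <4 x i64> and then casting the
       // integer vector to the pointer vector.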
2401   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2402          "Only one type should be a pointer type");
2403   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2404          "Only one type should be a floating point type");
2405   Type *IntTy =
2406       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2407   auto *VecIntTy = VectorType::get(IntTy, VF);
2408   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2409   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2410 }
2411 
2412 /// Return a vector containing interleaved elements from multiple
2413 /// smaller input vectors.
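     ///
     /// For example, interleaving two <4 x i32> vectors <a0, a1, a2, a3> and
     /// <b0, b1, b2, b3> yields <8 x i32> <a0, b0, a1, b1, a2, b2, a3, b3>.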
2414 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2415                                 const Twine &Name) {
2416   unsigned Factor = Vals.size();
2417   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2418 
2419   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2420 #ifndef NDEBUG
2421   for (Value *Val : Vals)
2422     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2423 #endif
2424 
2425   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2426   // must use intrinsics to interleave.
2427   if (VecTy->isScalableTy()) {
2428     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2429     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2430                                    Vals,
2431                                    /*FMFSource=*/nullptr, Name);
2432   }
2433 
2434   // Fixed length. Start by concatenating all vectors into a wide vector.
2435   Value *WideVec = concatenateVectors(Builder, Vals);
2436 
2437   // Interleave the elements into the wide vector.
2438   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2439   return Builder.CreateShuffleVector(
2440       WideVec, createInterleaveMask(NumElts, Factor), Name);
2441 }
2442 
2443 // Vectorize the interleave group that this recipe models.
2444 //
2445 // E.g. translate the following interleaved load group (factor = 3):
2446 //   for (i = 0; i < N; i+=3) {
2447 //     R = Pic[i];             // Member of index 0
2448 //     G = Pic[i+1];           // Member of index 1
2449 //     B = Pic[i+2];           // Member of index 2
2450 //     ... // do something to R, G, B
2451 //   }
2452 // To:
2453 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2454 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2455 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2456 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2457 //
2458 // Or translate the following interleaved store group (factor = 3):
2459 //   for (i = 0; i < N; i+=3) {
2460 //     ... do something to R, G, B
2461 //     Pic[i]   = R;           // Member of index 0
2462 //     Pic[i+1] = G;           // Member of index 1
2463 //     Pic[i+2] = B;           // Member of index 2
2464 //   }
2465 // To:
2466 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2467 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2468 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2469 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2470 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2471 void VPInterleaveRecipe::execute(VPTransformState &State) {
2472   assert(!State.Instance && "Interleave group being replicated.");
2473   const InterleaveGroup<Instruction> *Group = IG;
2474   Instruction *Instr = Group->getInsertPos();
2475 
2476   // Prepare for the vector type of the interleaved load/store.
2477   Type *ScalarTy = getLoadStoreType(Instr);
2478   unsigned InterleaveFactor = Group->getFactor();
2479   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2480 
2481   // Prepare for the new pointers.
2482   unsigned Index = Group->getIndex(Instr);
2483 
2484   // TODO: extend the masked interleaved-group support to reversed access.
2485   VPValue *BlockInMask = getMask();
2486   assert((!BlockInMask || !Group->isReverse()) &&
2487          "Reversed masked interleave-group not supported.");
2488 
2489   Value *Idx;
2490   // If the group is reverse, adjust the index to refer to the last vector lane
2491   // instead of the first. We adjust the index from the first vector lane,
2492   // rather than directly getting the pointer for lane VF - 1, because the
2493   // pointer operand of the interleaved access is supposed to be uniform.
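       //
       // For example, with VF = 4, factor = 3 and member index 1, the reverse
       // case computes Idx = -((4 - 1) * 3 + 1) = -10, while the forward case
       // simply uses Idx = -1.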
2494   if (Group->isReverse()) {
2495     Value *RuntimeVF =
2496         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2497     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2498     Idx = State.Builder.CreateMul(Idx,
2499                                   State.Builder.getInt32(Group->getFactor()));
2500     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2501     Idx = State.Builder.CreateNeg(Idx);
2502   } else
2503     Idx = State.Builder.getInt32(-Index);
2504 
2505   VPValue *Addr = getAddr();
2506   Value *ResAddr = State.get(Addr, VPIteration(0, 0));
2507   if (auto *I = dyn_cast<Instruction>(ResAddr))
2508     State.setDebugLocFrom(I->getDebugLoc());
2509 
2510   // Note that the current instruction could be at any index in the group. The
2511   // address needs to be adjusted to the member at index 0.
2512   //
2513   // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2514   //       b = A[i];       // Member of index 0
2515   // The current pointer points to A[i+1]; adjust it to A[i].
2516   //
2517   // E.g.  A[i+1] = a;     // Member of index 1
2518   //       A[i]   = b;     // Member of index 0
2519   //       A[i+2] = c;     // Member of index 2 (Current instruction)
2520   // The current pointer points to A[i+2]; adjust it to A[i].
2521 
2522   bool InBounds = false;
2523   if (auto *GEP = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
2524     InBounds = GEP->isInBounds();
2525   ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
2526 
2527   State.setDebugLocFrom(Instr->getDebugLoc());
2528   Value *PoisonVec = PoisonValue::get(VecTy);
2529 
2530   auto CreateGroupMask = [&BlockInMask, &State,
2531                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
2532     if (State.VF.isScalable()) {
2533       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2534       assert(InterleaveFactor == 2 &&
2535              "Unsupported deinterleave factor for scalable vectors");
2536       auto *ResBlockInMask = State.get(BlockInMask);
2537       SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2538       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2539                                      State.VF.getKnownMinValue() * 2, true);
2540       return State.Builder.CreateIntrinsic(
2541           MaskTy, Intrinsic::vector_interleave2, Ops,
2542           /*FMFSource=*/nullptr, "interleaved.mask");
2543     }
2544 
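         // For fixed-length vectors, each mask element is replicated across the
         // members of its group; e.g. with factor 2 and VF = 4, <m0, m1, m2, m3>
         // becomes <m0, m0, m1, m1, m2, m2, m3, m3>.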
2545     if (!BlockInMask)
2546       return MaskForGaps;
2547 
2548     Value *ResBlockInMask = State.get(BlockInMask);
2549     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2550         ResBlockInMask,
2551         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2552         "interleaved.mask");
2553     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2554                                                    ShuffledMask, MaskForGaps)
2555                        : ShuffledMask;
2556   };
2557 
2558   const DataLayout &DL = Instr->getDataLayout();
2559   // Vectorize the interleaved load group.
2560   if (isa<LoadInst>(Instr)) {
2561     Value *MaskForGaps = nullptr;
2562     if (NeedsMaskForGaps) {
2563       MaskForGaps = createBitMaskForGaps(State.Builder,
2564                                          State.VF.getKnownMinValue(), *Group);
2565       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2566     }
2567 
2568     Instruction *NewLoad;
2569     if (BlockInMask || MaskForGaps) {
2570       Value *GroupMask = CreateGroupMask(MaskForGaps);
2571       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
2572                                                Group->getAlign(), GroupMask,
2573                                                PoisonVec, "wide.masked.vec");
2574     } else
2575       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
2576                                                 Group->getAlign(), "wide.vec");
2577     Group->addMetadata(NewLoad);
2578 
2579     ArrayRef<VPValue *> VPDefs = definedValues();
2580     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2581     if (VecTy->isScalableTy()) {
2582       assert(InterleaveFactor == 2 &&
2583              "Unsupported deinterleave factor for scalable vectors");
2584 
2585       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2586       // so must use intrinsics to deinterleave.
2587       Value *DI = State.Builder.CreateIntrinsic(
2588           Intrinsic::vector_deinterleave2, VecTy, NewLoad,
2589           /*FMFSource=*/nullptr, "strided.vec");
2590       unsigned J = 0;
2591       for (unsigned I = 0; I < InterleaveFactor; ++I) {
2592         Instruction *Member = Group->getMember(I);
2593 
2594         if (!Member)
2595           continue;
2596 
2597         Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2598         // If this member has a different type, cast the result to it.
2599         if (Member->getType() != ScalarTy) {
2600           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2601           StridedVec =
2602               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2603         }
2604 
2605         if (Group->isReverse())
2606           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2607 
2608         State.set(VPDefs[J], StridedVec);
2609         ++J;
2610       }
2611 
2612       return;
2613     }
2614 
2615     // For each member in the group, shuffle out the appropriate data from the
2616     // wide loads.
2617     unsigned J = 0;
2618     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2619       Instruction *Member = Group->getMember(I);
2620 
2621       // Skip the gaps in the group.
2622       if (!Member)
2623         continue;
2624 
2625       auto StrideMask =
2626           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2627       Value *StridedVec =
2628           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
2629 
2630       // If this member has a different type, cast the result to it.
2631       if (Member->getType() != ScalarTy) {
2632         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2633         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2634         StridedVec =
2635             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2636       }
2637 
2638       if (Group->isReverse())
2639         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2640 
2641       State.set(VPDefs[J], StridedVec);
2642       ++J;
2643     }
2644     return;
2645   }
2646 
2647   // The subvector type for the current instruction.
2648   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2649 
2650   // Vectorize the interleaved store group.
2651   Value *MaskForGaps =
2652       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2653   assert((!MaskForGaps || !State.VF.isScalable()) &&
2654          "masking gaps for scalable vectors is not yet supported.");
2655   ArrayRef<VPValue *> StoredValues = getStoredValues();
2656   // Collect the stored vector from each member.
2657   SmallVector<Value *, 4> StoredVecs;
2658   unsigned StoredIdx = 0;
2659   for (unsigned i = 0; i < InterleaveFactor; i++) {
2660     assert((Group->getMember(i) || MaskForGaps) &&
2661            "Fail to get a member from an interleaved store group");
2662     Instruction *Member = Group->getMember(i);
2663 
2664     // Skip the gaps in the group.
2665     if (!Member) {
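           // MaskForGaps (non-null here, per the assert above) disables these
           // lanes in the final store, so a poison placeholder is sufficient.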
2666       Value *Poison = PoisonValue::get(SubVT);
2667       StoredVecs.push_back(Poison);
2668       continue;
2669     }
2670 
2671     Value *StoredVec = State.get(StoredValues[StoredIdx]);
2672     ++StoredIdx;
2673 
2674     if (Group->isReverse())
2675       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2676 
2677     // If this member has a different type, cast it to the common subvector
2678     // type.
2678 
2679     if (StoredVec->getType() != SubVT)
2680       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2681 
2682     StoredVecs.push_back(StoredVec);
2683   }
2684 
2685   // Interleave all the smaller vectors into one wider vector.
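       // For factor 2 the result is <A0, B0, A1, B1, ...>: one element from
       // each member vector per group position, in round-robin order.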
2686   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
2687   Instruction *NewStoreInstr;
2688   if (BlockInMask || MaskForGaps) {
2689     Value *GroupMask = CreateGroupMask(MaskForGaps);
2690     NewStoreInstr = State.Builder.CreateMaskedStore(
2691         IVec, ResAddr, Group->getAlign(), GroupMask);
2692   } else
2693     NewStoreInstr =
2694         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
2695 
2696   Group->addMetadata(NewStoreInstr);
2697 }
2698 
2699 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2700 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2701                                VPSlotTracker &SlotTracker) const {
2702   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2703   IG->getInsertPos()->printAsOperand(O, false);
2704   O << ", ";
2705   getAddr()->printAsOperand(O, SlotTracker);
2706   VPValue *Mask = getMask();
2707   if (Mask) {
2708     O << ", ";
2709     Mask->printAsOperand(O, SlotTracker);
2710   }
2711 
2712   unsigned OpIdx = 0;
2713   for (unsigned i = 0; i < IG->getFactor(); ++i) {
2714     if (!IG->getMember(i))
2715       continue;
2716     if (getNumStoreOperands() > 0) {
2717       O << "\n" << Indent << "  store ";
2718       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2719       O << " to index " << i;
2720     } else {
2721       O << "\n" << Indent << "  ";
2722       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2723       O << " = load from index " << i;
2724     }
2725     ++OpIdx;
2726   }
2727 }
2728 #endif
2729 
2730 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2731   Value *Start = getStartValue()->getLiveInIRValue();
2732   PHINode *Phi = PHINode::Create(Start->getType(), 2, "index");
2733   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2734 
2735   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2736   Phi->addIncoming(Start, VectorPH);
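       // Only the preheader incoming value is added here; the backedge value is
       // filled in after VPlan execution, once the vector latch block exists.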
2737   Phi->setDebugLoc(getDebugLoc());
2738   State.set(this, Phi, /*IsScalar*/ true);
2739 }
2740 
2741 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2742 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2743                                    VPSlotTracker &SlotTracker) const {
2744   O << Indent << "EMIT ";
2745   printAsOperand(O, SlotTracker);
2746   O << " = CANONICAL-INDUCTION ";
2747   printOperands(O, SlotTracker);
2748 }
2749 #endif
2750 
2751 bool VPCanonicalIVPHIRecipe::isCanonical(
2752     InductionDescriptor::InductionKind Kind, VPValue *Start,
2753     VPValue *Step) const {
2754   // Must be an integer induction.
2755   if (Kind != InductionDescriptor::IK_IntInduction)
2756     return false;
2757   // Start must match the start value of this canonical induction.
2758   if (Start != getStartValue())
2759     return false;
2760 
2761   // If the step is defined by a recipe, it is not a ConstantInt.
2762   if (Step->getDefiningRecipe())
2763     return false;
2764 
2765   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2766   return StepC && StepC->isOne();
2767 }
2768 
2769 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2770   return IsScalarAfterVectorization &&
2771          (!IsScalable || vputils::onlyFirstLaneUsed(this));
2772 }
2773 
2774 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
2775   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
2776          "Not a pointer induction according to InductionDescriptor!");
2777   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
2778          "Unexpected type.");
2779   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
2780          "Recipe should have been replaced");
2781 
2782   auto *IVR = getParent()->getPlan()->getCanonicalIV();
2783   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
2784   unsigned CurrentPart = getUnrollPart(*this);
2785 
2786   // Build a pointer phi.
2787   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
2788   Type *ScStValueType = ScalarStartValue->getType();
2789 
2790   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2791   PHINode *NewPointerPhi = nullptr;
2792   if (CurrentPart == 0) {
2793     NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
2794                                     CanonicalIV->getIterator());
2795     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
2796   } else {
2797     // The recipe has been unrolled. In that case, fetch the single pointer phi
2798     // shared among all unrolled parts of the recipe.
2799     auto *GEP =
2800         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
2801     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
2802   }
2803 
2804   // A pointer induction, performed using a GEP.
2805   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
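       // The scalar step is uniform across all unrolled parts, so lane 0 of
       // part 0 suffices (see the assert before the vector GEP below).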
2806   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
2807   Type *PhiType = IndDesc.getStep()->getType();
2808   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
2809   // Add induction update using an incorrect block temporarily. The phi node
2810   // will be fixed after VPlan execution. Note that at this point the latch
2811   // block cannot be used, as it does not exist yet.
2812   // TODO: Model increment value in VPlan, by turning the recipe into a
2813   // multi-def and a subclass of VPHeaderPHIRecipe.
2814   if (CurrentPart == 0) {
2815     // The recipe represents the first part of the pointer induction. Create the
2816     // GEP to increment the phi across all unrolled parts.
2817     unsigned UF = getParent()->getPlan()->getUF();
2818     Value *NumUnrolledElems =
2819         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
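         // NumUnrolledElems = VF * UF, i.e. the number of elements the pointer
         // advances per vector-loop iteration across all unrolled parts.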
2820 
2821     Value *InductionGEP = GetElementPtrInst::Create(
2822         State.Builder.getInt8Ty(), NewPointerPhi,
2823         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
2824         InductionLoc);
2825 
2826     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
2827   }
2828 
2829   // Create the actual address GEPs that use the pointer phi as base and a
2830   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
2831   Type *VecPhiType = VectorType::get(PhiType, State.VF);
2832   Value *StartOffsetScalar = State.Builder.CreateMul(
2833       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
2834   Value *StartOffset =
2835       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
2836   // Add a step vector <0, 1, ..., VF-1> of consecutive lane indices.
2837   StartOffset = State.Builder.CreateAdd(
2838       StartOffset, State.Builder.CreateStepVector(VecPhiType));
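       // Lane L of part P now holds the element offset P * VF + L; multiplying
       // by the step below yields the per-lane pointer offsets.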
2839 
2840   assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, 0)) &&
2841          "scalar step must be the same across all parts");
2842   Value *GEP = State.Builder.CreateGEP(
2843       State.Builder.getInt8Ty(), NewPointerPhi,
2844       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
2845                                                State.VF, ScalarStepValue)),
2846       "vector.gep");
2847   State.set(this, GEP);
2848 }
2849 
2850 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2851 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2852                                           VPSlotTracker &SlotTracker) const {
2853   assert((getNumOperands() == 2 || getNumOperands() == 4) &&
2854          "unexpected number of operands");
2855   O << Indent << "EMIT ";
2856   printAsOperand(O, SlotTracker);
2857   O << " = WIDEN-POINTER-INDUCTION ";
2858   getStartValue()->printAsOperand(O, SlotTracker);
2859   O << ", " << *IndDesc.getStep();
2860   if (getNumOperands() == 4) {
2861     O << ", ";
2862     getOperand(2)->printAsOperand(O, SlotTracker);
2863     O << ", ";
2864     getOperand(3)->printAsOperand(O, SlotTracker);
2865   }
2866 }
2867 #endif
2868 
2869 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2870   assert(!State.Instance && "cannot be used in per-lane execution");
2871   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2872   SCEVExpander Exp(SE, DL, "induction");
2873 
2874   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
2875                                  &*State.Builder.GetInsertPoint());
2876   assert(!State.ExpandedSCEVs.contains(Expr) &&
2877          "Same SCEV expanded multiple times");
2878   State.ExpandedSCEVs[Expr] = Res;
2879   State.set(this, Res, {0, 0});
2880 }
2881 
2882 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2883 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2884                                VPSlotTracker &SlotTracker) const {
2885   O << Indent << "EMIT ";
2886   getVPSingleValue()->printAsOperand(O, SlotTracker);
2887   O << " = EXPAND SCEV " << *Expr;
2888 }
2889 #endif
2890 
2891 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2892   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
2893   Type *STy = CanonicalIV->getType();
2894   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2895   ElementCount VF = State.VF;
2896   Value *VStart = VF.isScalar()
2897                       ? CanonicalIV
2898                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
2899   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
2900   if (VF.isVector()) {
2901     VStep = Builder.CreateVectorSplat(VF, VStep);
2902     VStep =
2903         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
2904   }
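       // Lane L of part P therefore holds CanonicalIV + P * VF + L.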
2905   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
2906   State.set(this, CanonicalVectorIV);
2907 }
2908 
2909 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2910 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
2911                                      VPSlotTracker &SlotTracker) const {
2912   O << Indent << "EMIT ";
2913   printAsOperand(O, SlotTracker);
2914   O << " = WIDEN-CANONICAL-INDUCTION ";
2915   printOperands(O, SlotTracker);
2916 }
2917 #endif
2918 
2919 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
2920   auto &Builder = State.Builder;
2921   // Create a vector from the initial value.
2922   auto *VectorInit = getStartValue()->getLiveInIRValue();
2923 
2924   Type *VecTy = State.VF.isScalar()
2925                     ? VectorInit->getType()
2926                     : VectorType::get(VectorInit->getType(), State.VF);
2927 
2928   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2929   if (State.VF.isVector()) {
2930     auto *IdxTy = Builder.getInt32Ty();
2931     auto *One = ConstantInt::get(IdxTy, 1);
2932     IRBuilder<>::InsertPointGuard Guard(Builder);
2933     Builder.SetInsertPoint(VectorPH->getTerminator());
2934     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
2935     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
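         // Insert the start value into the last lane, giving
         // <poison, ..., poison, init>; only that lane is read by the splice
         // that forms the recurrence in the loop body.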
2936     VectorInit = Builder.CreateInsertElement(
2937         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
2938   }
2939 
2940   // Create a phi node for the new recurrence.
2941   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
2942   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2943   Phi->addIncoming(VectorInit, VectorPH);
2944   State.set(this, Phi);
2945 }
2946 
2947 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2948 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
2949                                             VPSlotTracker &SlotTracker) const {
2950   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
2951   printAsOperand(O, SlotTracker);
2952   O << " = phi ";
2953   printOperands(O, SlotTracker);
2954 }
2955 #endif
2956 
2957 void VPReductionPHIRecipe::execute(VPTransformState &State) {
2958   auto &Builder = State.Builder;
2959 
2960   // Reductions do not have to start at zero. They can start with
2961   // any loop-invariant value.
2962   VPValue *StartVPV = getStartValue();
2963   Value *StartV = StartVPV->getLiveInIRValue();
2964 
2965   // In order to support recurrences, we need to be able to vectorize Phi
2966   // nodes.
2966   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
2967   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
2968   // this value when we vectorize all of the instructions that use the PHI.
2969   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
2970   Type *VecTy = ScalarPHI ? StartV->getType()
2971                           : VectorType::get(StartV->getType(), State.VF);
2972 
2973   BasicBlock *HeaderBB = State.CFG.PrevBB;
2974   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
2975          "recipe must be in the vector loop header");
2976   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
2977   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
2978   State.set(this, Phi, IsInLoop);
2979 
2980   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2981 
2982   Value *Iden = nullptr;
2983   RecurKind RK = RdxDesc.getRecurrenceKind();
2984   unsigned CurrentPart = getUnrollPart(*this);
2985 
2986   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
2987       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
2988     // MinMax and AnyOf reductions have the start value as their identity.
2989     if (ScalarPHI) {
2990       Iden = StartV;
2991     } else {
2992       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2993       Builder.SetInsertPoint(VectorPH->getTerminator());
2994       StartV = Iden = State.get(StartVPV);
2995     }
2996   } else {
2997     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
2998                                        RdxDesc.getFastMathFlags());
2999 
3000     if (!ScalarPHI) {
3001       if (CurrentPart == 0) {
3002         // Create start and identity vector values for the reduction in the
3003         // preheader.
3004         // TODO: Introduce recipes in VPlan preheader to create initial values.
3005         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3006         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3007         Builder.SetInsertPoint(VectorPH->getTerminator());
3008         Constant *Zero = Builder.getInt32(0);
3009         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
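             // E.g. for an integer add reduction, part 0 starts as
             // <StartV, 0, ..., 0>, while the remaining parts start from the
             // plain identity splat (else branch below).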
3010       } else {
3011         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3012       }
3013     }
3014   }
3015 
3016   Phi = cast<PHINode>(State.get(this, IsInLoop));
3017   Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3018   Phi->addIncoming(StartVal, VectorPH);
3019 }
3020 
3021 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3022 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3023                                  VPSlotTracker &SlotTracker) const {
3024   O << Indent << "WIDEN-REDUCTION-PHI ";
3025 
3026   printAsOperand(O, SlotTracker);
3027   O << " = phi ";
3028   printOperands(O, SlotTracker);
3029 }
3030 #endif
3031 
3032 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3033   assert(EnableVPlanNativePath &&
3034          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3035 
3036   Value *Op0 = State.get(getOperand(0));
3037   Type *VecTy = Op0->getType();
3038   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3039   State.set(this, VecPhi);
3040 }
3041 
3042 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3043 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3044                              VPSlotTracker &SlotTracker) const {
3045   O << Indent << "WIDEN-PHI ";
3046 
3047   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3048   // Unless all incoming values are modeled in VPlan, print the original PHI
3049   // directly.
3050   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3051   // values as VPValues.
3052   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3053     O << VPlanIngredient(OriginalPhi);
3054     return;
3055   }
3056 
3057   printAsOperand(O, SlotTracker);
3058   O << " = phi ";
3059   printOperands(O, SlotTracker);
3060 }
3061 #endif
3062 
3063 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3064 // remove VPActiveLaneMaskPHIRecipe.
3065 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3066   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3067   Value *StartMask = State.get(getOperand(0));
3068   PHINode *Phi =
3069       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3070   Phi->addIncoming(StartMask, VectorPH);
3071   Phi->setDebugLoc(getDebugLoc());
3072   State.set(this, Phi);
3073 }
3074 
3075 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3076 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3077                                       VPSlotTracker &SlotTracker) const {
3078   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3079 
3080   printAsOperand(O, SlotTracker);
3081   O << " = phi ";
3082   printOperands(O, SlotTracker);
3083 }
3084 #endif
3085 
3086 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
3087   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3088   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
3089   PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
3090   Phi->addIncoming(Start, VectorPH);
3091   Phi->setDebugLoc(getDebugLoc());
3092   State.set(this, Phi, /*IsScalar=*/true);
3093 }
3094 
3095 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3096 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3097                                   VPSlotTracker &SlotTracker) const {
3098   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3099 
3100   printAsOperand(O, SlotTracker);
3101   O << " = phi ";
3102   printOperands(O, SlotTracker);
3103 }
3104 #endif
3105