xref: /llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (revision ce192b87b2a09ee27e4077763db0486921a485c0)
1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "VPlanUtils.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Twine.h"
20 #include "llvm/Analysis/IVDescriptors.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/Instruction.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/IR/Type.h"
26 #include "llvm/IR/Value.h"
27 #include "llvm/IR/VectorBuilder.h"
28 #include "llvm/Support/Casting.h"
29 #include "llvm/Support/CommandLine.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
33 #include "llvm/Transforms/Utils/LoopUtils.h"
34 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
35 #include <cassert>
36 
37 using namespace llvm;
38 
39 using VectorParts = SmallVector<Value *, 2>;
40 
41 namespace llvm {
42 extern cl::opt<bool> EnableVPlanNativePath;
43 }
44 extern cl::opt<unsigned> ForceTargetInstructionCost;
45 
46 #define LV_NAME "loop-vectorize"
47 #define DEBUG_TYPE LV_NAME
48 
49 bool VPRecipeBase::mayWriteToMemory() const {
50   switch (getVPDefID()) {
51   case VPInterleaveSC:
52     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
53   case VPWidenStoreEVLSC:
54   case VPWidenStoreSC:
55     return true;
56   case VPReplicateSC:
57     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
58         ->mayWriteToMemory();
59   case VPWidenCallSC:
60     return !cast<VPWidenCallRecipe>(this)
61                 ->getCalledScalarFunction()
62                 ->onlyReadsMemory();
63   case VPBranchOnMaskSC:
64   case VPScalarIVStepsSC:
65   case VPPredInstPHISC:
66     return false;
67   case VPBlendSC:
68   case VPReductionEVLSC:
69   case VPReductionSC:
70   case VPWidenCanonicalIVSC:
71   case VPWidenCastSC:
72   case VPWidenGEPSC:
73   case VPWidenIntOrFpInductionSC:
74   case VPWidenLoadEVLSC:
75   case VPWidenLoadSC:
76   case VPWidenPHISC:
77   case VPWidenSC:
78   case VPWidenEVLSC:
79   case VPWidenSelectSC: {
80     const Instruction *I =
81         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
82     (void)I;
83     assert((!I || !I->mayWriteToMemory()) &&
84            "underlying instruction may write to memory");
85     return false;
86   }
87   default:
88     return true;
89   }
90 }
91 
92 bool VPRecipeBase::mayReadFromMemory() const {
93   switch (getVPDefID()) {
94   case VPWidenLoadEVLSC:
95   case VPWidenLoadSC:
96     return true;
97   case VPReplicateSC:
98     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
99         ->mayReadFromMemory();
100   case VPWidenCallSC:
101     return !cast<VPWidenCallRecipe>(this)
102                 ->getCalledScalarFunction()
103                 ->onlyWritesMemory();
104   case VPBranchOnMaskSC:
105   case VPPredInstPHISC:
106   case VPScalarIVStepsSC:
107   case VPWidenStoreEVLSC:
108   case VPWidenStoreSC:
109     return false;
110   case VPBlendSC:
111   case VPReductionEVLSC:
112   case VPReductionSC:
113   case VPWidenCanonicalIVSC:
114   case VPWidenCastSC:
115   case VPWidenGEPSC:
116   case VPWidenIntOrFpInductionSC:
117   case VPWidenPHISC:
118   case VPWidenSC:
119   case VPWidenEVLSC:
120   case VPWidenSelectSC: {
121     const Instruction *I =
122         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
123     (void)I;
124     assert((!I || !I->mayReadFromMemory()) &&
125            "underlying instruction may read from memory");
126     return false;
127   }
128   default:
129     return true;
130   }
131 }
132 
133 bool VPRecipeBase::mayHaveSideEffects() const {
134   switch (getVPDefID()) {
135   case VPDerivedIVSC:
136   case VPPredInstPHISC:
137   case VPScalarCastSC:
138     return false;
139   case VPInstructionSC:
140     switch (cast<VPInstruction>(this)->getOpcode()) {
141     case Instruction::Or:
142     case Instruction::ICmp:
143     case Instruction::Select:
144     case VPInstruction::Not:
145     case VPInstruction::CalculateTripCountMinusVF:
146     case VPInstruction::CanonicalIVIncrementForPart:
147     case VPInstruction::ExtractFromEnd:
148     case VPInstruction::FirstOrderRecurrenceSplice:
149     case VPInstruction::LogicalAnd:
150     case VPInstruction::PtrAdd:
151       return false;
152     default:
153       return true;
154     }
155   case VPWidenCallSC: {
156     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
157     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
158   }
159   case VPBlendSC:
160   case VPReductionEVLSC:
161   case VPReductionSC:
162   case VPScalarIVStepsSC:
163   case VPWidenCanonicalIVSC:
164   case VPWidenCastSC:
165   case VPWidenGEPSC:
166   case VPWidenIntOrFpInductionSC:
167   case VPWidenPHISC:
168   case VPWidenPointerInductionSC:
169   case VPWidenSC:
170   case VPWidenEVLSC:
171   case VPWidenSelectSC: {
172     const Instruction *I =
173         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
174     (void)I;
175     assert((!I || !I->mayHaveSideEffects()) &&
176            "underlying instruction has side-effects");
177     return false;
178   }
179   case VPInterleaveSC:
180     return mayWriteToMemory();
181   case VPWidenLoadEVLSC:
182   case VPWidenLoadSC:
183   case VPWidenStoreEVLSC:
184   case VPWidenStoreSC:
185     assert(
186         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
187             mayWriteToMemory() &&
188         "mayHaveSideffects result for ingredient differs from this "
189         "implementation");
190     return mayWriteToMemory();
191   case VPReplicateSC: {
192     auto *R = cast<VPReplicateRecipe>(this);
193     return R->getUnderlyingInstr()->mayHaveSideEffects();
194   }
195   default:
196     return true;
197   }
198 }
199 
200 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
201   VPValue *ExitValue = getOperand(0);
202   VPBasicBlock *MiddleVPBB =
203       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
204   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
205   auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
206   // Values leaving the vector loop reach live out phi's in the exiting block
207   // via middle block.
208   auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
209                        ? MiddleVPBB
210                        : ExitingVPBB;
211   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
212   Value *V = State.get(ExitValue, VPIteration(0, 0));
213   if (Phi->getBasicBlockIndex(PredBB) != -1)
214     Phi->setIncomingValueForBlock(PredBB, V);
215   else
216     Phi->addIncoming(V, PredBB);
217 }
218 
219 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
220 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
221   O << "Live-out ";
222   getPhi()->printAsOperand(O);
223   O << " = ";
224   getOperand(0)->printAsOperand(O, SlotTracker);
225   O << "\n";
226 }
227 #endif
228 
229 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
230   assert(!Parent && "Recipe already in some VPBasicBlock");
231   assert(InsertPos->getParent() &&
232          "Insertion position not in any VPBasicBlock");
233   InsertPos->getParent()->insert(this, InsertPos->getIterator());
234 }
235 
236 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
237                                 iplist<VPRecipeBase>::iterator I) {
238   assert(!Parent && "Recipe already in some VPBasicBlock");
239   assert(I == BB.end() || I->getParent() == &BB);
240   BB.insert(this, I);
241 }
242 
243 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
244   assert(!Parent && "Recipe already in some VPBasicBlock");
245   assert(InsertPos->getParent() &&
246          "Insertion position not in any VPBasicBlock");
247   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
248 }
249 
250 void VPRecipeBase::removeFromParent() {
251   assert(getParent() && "Recipe not in any VPBasicBlock");
252   getParent()->getRecipeList().remove(getIterator());
253   Parent = nullptr;
254 }
255 
256 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
257   assert(getParent() && "Recipe not in any VPBasicBlock");
258   return getParent()->getRecipeList().erase(getIterator());
259 }
260 
261 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
262   removeFromParent();
263   insertAfter(InsertPos);
264 }
265 
266 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
267                               iplist<VPRecipeBase>::iterator I) {
268   removeFromParent();
269   insertBefore(BB, I);
270 }
271 
272 /// Return the underlying instruction to be used for computing \p R's cost via
273 /// the legacy cost model. Return nullptr if there's no suitable instruction.
274 static Instruction *getInstructionForCost(const VPRecipeBase *R) {
275   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
276     return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
277   if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
278     return IG->getInsertPos();
279   // Currently the legacy cost model only calculates the instruction cost with
280   // underlying instruction. Removing the WidenMem here will prevent
281   // force-target-instruction-cost overwriting the cost of recipe with
282   // underlying instruction which is inconsistent with the legacy model.
283   // TODO: Remove WidenMem from this function when we don't need to compare to
284   // the legacy model.
285   if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
286     return &WidenMem->getIngredient();
287   return nullptr;
288 }
289 
290 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
291   auto *UI = getInstructionForCost(this);
292   if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
293     return 0;
294 
295   InstructionCost RecipeCost = computeCost(VF, Ctx);
296   if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
297       RecipeCost.isValid())
298     RecipeCost = InstructionCost(ForceTargetInstructionCost);
299 
300   LLVM_DEBUG({
301     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
302     dump();
303   });
304   return RecipeCost;
305 }
306 
307 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
308                                           VPCostContext &Ctx) const {
309   // Compute the cost for the recipe falling back to the legacy cost model using
310   // the underlying instruction. If there is no underlying instruction, returns
311   // 0.
312   Instruction *UI = getInstructionForCost(this);
313   if (UI && isa<VPReplicateRecipe>(this)) {
314     // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
315     // transform, avoid computing their cost multiple times for now.
316     Ctx.SkipCostComputation.insert(UI);
317   }
318   return UI ? Ctx.getLegacyCost(UI, VF) : 0;
319 }
320 
321 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
322   assert(OpType == OperationType::FPMathOp &&
323          "recipe doesn't have fast math flags");
324   FastMathFlags Res;
325   Res.setAllowReassoc(FMFs.AllowReassoc);
326   Res.setNoNaNs(FMFs.NoNaNs);
327   Res.setNoInfs(FMFs.NoInfs);
328   Res.setNoSignedZeros(FMFs.NoSignedZeros);
329   Res.setAllowReciprocal(FMFs.AllowReciprocal);
330   Res.setAllowContract(FMFs.AllowContract);
331   Res.setApproxFunc(FMFs.ApproxFunc);
332   return Res;
333 }
334 
335 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
336                              VPValue *A, VPValue *B, DebugLoc DL,
337                              const Twine &Name)
338     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
339                           Pred, DL),
340       Opcode(Opcode), Name(Name.str()) {
341   assert(Opcode == Instruction::ICmp &&
342          "only ICmp predicates supported at the moment");
343 }
344 
345 VPInstruction::VPInstruction(unsigned Opcode,
346                              std::initializer_list<VPValue *> Operands,
347                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
348     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
349       Opcode(Opcode), Name(Name.str()) {
350   // Make sure the VPInstruction is a floating-point operation.
351   assert(isFPMathOp() && "this op can't take fast-math flags");
352 }
353 
354 bool VPInstruction::doesGeneratePerAllLanes() const {
355   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
356 }
357 
358 bool VPInstruction::canGenerateScalarForFirstLane() const {
359   if (Instruction::isBinaryOp(getOpcode()))
360     return true;
361   if (isSingleScalar() || isVectorToScalar())
362     return true;
363   switch (Opcode) {
364   case Instruction::ICmp:
365   case VPInstruction::BranchOnCond:
366   case VPInstruction::BranchOnCount:
367   case VPInstruction::CalculateTripCountMinusVF:
368   case VPInstruction::CanonicalIVIncrementForPart:
369   case VPInstruction::PtrAdd:
370   case VPInstruction::ExplicitVectorLength:
371     return true;
372   default:
373     return false;
374   }
375 }
376 
377 Value *VPInstruction::generatePerLane(VPTransformState &State,
378                                       const VPIteration &Lane) {
379   IRBuilderBase &Builder = State.Builder;
380 
381   assert(getOpcode() == VPInstruction::PtrAdd &&
382          "only PtrAdd opcodes are supported for now");
383   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
384                               State.get(getOperand(1), Lane), Name);
385 }
386 
387 Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
388   IRBuilderBase &Builder = State.Builder;
389 
390   if (Instruction::isBinaryOp(getOpcode())) {
391     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
392     Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
393     Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
394     auto *Res =
395         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
396     if (auto *I = dyn_cast<Instruction>(Res))
397       setFlags(I);
398     return Res;
399   }
400 
401   switch (getOpcode()) {
402   case VPInstruction::Not: {
403     Value *A = State.get(getOperand(0), Part);
404     return Builder.CreateNot(A, Name);
405   }
406   case Instruction::ICmp: {
407     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
408     Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
409     Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
410     return Builder.CreateCmp(getPredicate(), A, B, Name);
411   }
412   case Instruction::Select: {
413     Value *Cond = State.get(getOperand(0), Part);
414     Value *Op1 = State.get(getOperand(1), Part);
415     Value *Op2 = State.get(getOperand(2), Part);
416     return Builder.CreateSelect(Cond, Op1, Op2, Name);
417   }
418   case VPInstruction::ActiveLaneMask: {
419     // Get first lane of vector induction variable.
420     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
421     // Get the original loop tripcount.
422     Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
423 
424     // If this part of the active lane mask is scalar, generate the CMP directly
425     // to avoid unnecessary extracts.
426     if (State.VF.isScalar())
427       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
428                                Name);
429 
430     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
431     auto *PredTy = VectorType::get(Int1Ty, State.VF);
432     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
433                                    {PredTy, ScalarTC->getType()},
434                                    {VIVElem0, ScalarTC}, nullptr, Name);
435   }
436   case VPInstruction::FirstOrderRecurrenceSplice: {
437     // Generate code to combine the previous and current values in vector v3.
438     //
439     //   vector.ph:
440     //     v_init = vector(..., ..., ..., a[-1])
441     //     br vector.body
442     //
443     //   vector.body
444     //     i = phi [0, vector.ph], [i+4, vector.body]
445     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
446     //     v2 = a[i, i+1, i+2, i+3];
447     //     v3 = vector(v1(3), v2(0, 1, 2))
448 
449     // For the first part, use the recurrence phi (v1), otherwise v2.
450     auto *V1 = State.get(getOperand(0), 0);
451     Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
452     if (!PartMinus1->getType()->isVectorTy())
453       return PartMinus1;
454     Value *V2 = State.get(getOperand(1), Part);
455     return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
456   }
457   case VPInstruction::CalculateTripCountMinusVF: {
458     if (Part != 0)
459       return State.get(this, 0, /*IsScalar*/ true);
460 
461     Value *ScalarTC = State.get(getOperand(0), {0, 0});
462     Value *Step =
463         createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
464     Value *Sub = Builder.CreateSub(ScalarTC, Step);
465     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
466     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
467     return Builder.CreateSelect(Cmp, Sub, Zero);
468   }
469   case VPInstruction::ExplicitVectorLength: {
470     // Compute EVL
471     auto GetEVL = [=](VPTransformState &State, Value *AVL) {
472       assert(AVL->getType()->isIntegerTy() &&
473              "Requested vector length should be an integer.");
474 
475       // TODO: Add support for MaxSafeDist for correct loop emission.
476       assert(State.VF.isScalable() && "Expected scalable vector factor.");
477       Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
478 
479       Value *EVL = State.Builder.CreateIntrinsic(
480           State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
481           {AVL, VFArg, State.Builder.getTrue()});
482       return EVL;
483     };
484     // TODO: Restructure this code with an explicit remainder loop, vsetvli can
485     // be outside of the main loop.
486     assert(Part == 0 && "No unrolling expected for predicated vectorization.");
487     // Compute VTC - IV as the AVL (requested vector length).
488     Value *Index = State.get(getOperand(0), VPIteration(0, 0));
489     Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
490     Value *AVL = State.Builder.CreateSub(TripCount, Index);
491     Value *EVL = GetEVL(State, AVL);
492     return EVL;
493   }
494   case VPInstruction::CanonicalIVIncrementForPart: {
495     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
496     if (Part == 0)
497       return IV;
498 
499     // The canonical IV is incremented by the vectorization factor (num of SIMD
500     // elements) times the unroll part.
501     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
502     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
503                              hasNoSignedWrap());
504   }
505   case VPInstruction::BranchOnCond: {
506     if (Part != 0)
507       return nullptr;
508 
509     Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
510     // Replace the temporary unreachable terminator with a new conditional
511     // branch, hooking it up to backward destination for exiting blocks now and
512     // to forward destination(s) later when they are created.
513     BranchInst *CondBr =
514         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
515     CondBr->setSuccessor(0, nullptr);
516     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
517 
518     if (!getParent()->isExiting())
519       return CondBr;
520 
521     VPRegionBlock *ParentRegion = getParent()->getParent();
522     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
523     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
524     return CondBr;
525   }
526   case VPInstruction::BranchOnCount: {
527     if (Part != 0)
528       return nullptr;
529     // First create the compare.
530     Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
531     Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
532     Value *Cond = Builder.CreateICmpEQ(IV, TC);
533 
534     // Now create the branch.
535     auto *Plan = getParent()->getPlan();
536     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
537     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
538 
539     // Replace the temporary unreachable terminator with a new conditional
540     // branch, hooking it up to backward destination (the header) now and to the
541     // forward destination (the exit/middle block) later when it is created.
542     // Note that CreateCondBr expects a valid BB as first argument, so we need
543     // to set it to nullptr later.
544     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
545                                               State.CFG.VPBB2IRBB[Header]);
546     CondBr->setSuccessor(0, nullptr);
547     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
548     return CondBr;
549   }
550   case VPInstruction::ComputeReductionResult: {
551     if (Part != 0)
552       return State.get(this, 0, /*IsScalar*/ true);
553 
554     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
555     // and will be removed by breaking up the recipe further.
556     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
557     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
558     // Get its reduction variable descriptor.
559     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
560 
561     RecurKind RK = RdxDesc.getRecurrenceKind();
562 
563     VPValue *LoopExitingDef = getOperand(1);
564     Type *PhiTy = OrigPhi->getType();
565     VectorParts RdxParts(State.UF);
566     for (unsigned Part = 0; Part < State.UF; ++Part)
567       RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
568 
569     // If the vector reduction can be performed in a smaller type, we truncate
570     // then extend the loop exit value to enable InstCombine to evaluate the
571     // entire expression in the smaller type.
572     // TODO: Handle this in truncateToMinBW.
573     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
574       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
575       for (unsigned Part = 0; Part < State.UF; ++Part)
576         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
577     }
578     // Reduce all of the unrolled parts into a single vector.
579     Value *ReducedPartRdx = RdxParts[0];
580     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
581     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
582       Op = Instruction::Or;
583 
584     if (PhiR->isOrdered()) {
585       ReducedPartRdx = RdxParts[State.UF - 1];
586     } else {
587       // Floating-point operations should have some FMF to enable the reduction.
588       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
589       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
590       for (unsigned Part = 1; Part < State.UF; ++Part) {
591         Value *RdxPart = RdxParts[Part];
592         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
593           ReducedPartRdx = Builder.CreateBinOp(
594               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
595         else
596           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
597       }
598     }
599 
600     // Create the reduction after the loop. Note that inloop reductions create
601     // the target reduction in the loop using a Reduction recipe.
602     if ((State.VF.isVector() ||
603          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
604         !PhiR->isInLoop()) {
605       ReducedPartRdx =
606           createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
607       // If the reduction can be performed in a smaller type, we need to extend
608       // the reduction to the wider type before we branch to the original loop.
609       if (PhiTy != RdxDesc.getRecurrenceType())
610         ReducedPartRdx = RdxDesc.isSigned()
611                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
612                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
613     }
614 
615     // If there were stores of the reduction value to a uniform memory address
616     // inside the loop, create the final store here.
617     if (StoreInst *SI = RdxDesc.IntermediateStore) {
618       auto *NewSI = Builder.CreateAlignedStore(
619           ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
620       propagateMetadata(NewSI, SI);
621     }
622 
623     return ReducedPartRdx;
624   }
625   case VPInstruction::ExtractFromEnd: {
626     if (Part != 0)
627       return State.get(this, 0, /*IsScalar*/ true);
628 
629     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
630     unsigned Offset = CI->getZExtValue();
631     assert(Offset > 0 && "Offset from end must be positive");
632     Value *Res;
633     if (State.VF.isVector()) {
634       assert(Offset <= State.VF.getKnownMinValue() &&
635              "invalid offset to extract from");
636       // Extract lane VF - Offset from the operand.
637       Res = State.get(
638           getOperand(0),
639           VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
640     } else {
641       assert(Offset <= State.UF && "invalid offset to extract from");
642       // When loop is unrolled without vectorizing, retrieve UF - Offset.
643       Res = State.get(getOperand(0), State.UF - Offset);
644     }
645     if (isa<ExtractElementInst>(Res))
646       Res->setName(Name);
647     return Res;
648   }
649   case VPInstruction::LogicalAnd: {
650     Value *A = State.get(getOperand(0), Part);
651     Value *B = State.get(getOperand(1), Part);
652     return Builder.CreateLogicalAnd(A, B, Name);
653   }
654   case VPInstruction::PtrAdd: {
655     assert(vputils::onlyFirstLaneUsed(this) &&
656            "can only generate first lane for PtrAdd");
657     Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
658     Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
659     return Builder.CreatePtrAdd(Ptr, Addend, Name);
660   }
661   case VPInstruction::ResumePhi: {
662     if (Part != 0)
663       return State.get(this, 0, /*IsScalar*/ true);
664     Value *IncomingFromVPlanPred =
665         State.get(getOperand(0), Part, /* IsScalar */ true);
666     Value *IncomingFromOtherPreds =
667         State.get(getOperand(1), Part, /* IsScalar */ true);
668     auto *NewPhi =
669         Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
670     BasicBlock *VPlanPred =
671         State.CFG
672             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
673     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
674     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
675       assert(OtherPred != VPlanPred &&
676              "VPlan predecessors should not be connected yet");
677       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
678     }
679     return NewPhi;
680   }
681 
682   default:
683     llvm_unreachable("Unsupported opcode for instruction");
684   }
685 }
686 
687 bool VPInstruction::isVectorToScalar() const {
688   return getOpcode() == VPInstruction::ExtractFromEnd ||
689          getOpcode() == VPInstruction::ComputeReductionResult;
690 }
691 
692 bool VPInstruction::isSingleScalar() const {
693   return getOpcode() == VPInstruction::ResumePhi;
694 }
695 
696 #if !defined(NDEBUG)
697 bool VPInstruction::isFPMathOp() const {
698   // Inspired by FPMathOperator::classof. Notable differences are that we don't
699   // support Call, PHI and Select opcodes here yet.
700   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
701          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
702          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
703          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
704 }
705 #endif
706 
707 void VPInstruction::execute(VPTransformState &State) {
708   assert(!State.Instance && "VPInstruction executing an Instance");
709   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
710   assert((hasFastMathFlags() == isFPMathOp() ||
711           getOpcode() == Instruction::Select) &&
712          "Recipe not a FPMathOp but has fast-math flags?");
713   if (hasFastMathFlags())
714     State.Builder.setFastMathFlags(getFastMathFlags());
715   State.setDebugLocFrom(getDebugLoc());
716   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
717                                    (vputils::onlyFirstLaneUsed(this) ||
718                                     isVectorToScalar() || isSingleScalar());
719   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
720   bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
721   for (unsigned Part = 0; Part < State.UF; ++Part) {
722     if (GeneratesPerAllLanes) {
723       for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
724            Lane != NumLanes; ++Lane) {
725         Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
726         assert(GeneratedValue && "generatePerLane must produce a value");
727         State.set(this, GeneratedValue, VPIteration(Part, Lane));
728       }
729       continue;
730     }
731 
732     if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
733       Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
734       State.set(this, Part0, Part,
735                 /*IsScalar*/ GeneratesPerFirstLaneOnly);
736       continue;
737     }
738 
739     Value *GeneratedValue = generatePerPart(State, Part);
740     if (!hasResult())
741       continue;
742     assert(GeneratedValue && "generatePerPart must produce a value");
743     assert((GeneratedValue->getType()->isVectorTy() ==
744                 !GeneratesPerFirstLaneOnly ||
745             State.VF.isScalar()) &&
746            "scalar value but not only first lane defined");
747     State.set(this, GeneratedValue, Part,
748               /*IsScalar*/ GeneratesPerFirstLaneOnly);
749   }
750 }
751 
752 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
753   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
754   if (Instruction::isBinaryOp(getOpcode()))
755     return vputils::onlyFirstLaneUsed(this);
756 
757   switch (getOpcode()) {
758   default:
759     return false;
760   case Instruction::ICmp:
761   case VPInstruction::PtrAdd:
762     // TODO: Cover additional opcodes.
763     return vputils::onlyFirstLaneUsed(this);
764   case VPInstruction::ActiveLaneMask:
765   case VPInstruction::ExplicitVectorLength:
766   case VPInstruction::CalculateTripCountMinusVF:
767   case VPInstruction::CanonicalIVIncrementForPart:
768   case VPInstruction::BranchOnCount:
769   case VPInstruction::BranchOnCond:
770   case VPInstruction::ResumePhi:
771     return true;
772   };
773   llvm_unreachable("switch should return");
774 }
775 
776 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
777   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
778   if (Instruction::isBinaryOp(getOpcode()))
779     return vputils::onlyFirstPartUsed(this);
780 
781   switch (getOpcode()) {
782   default:
783     return false;
784   case Instruction::ICmp:
785   case Instruction::Select:
786     return vputils::onlyFirstPartUsed(this);
787   case VPInstruction::BranchOnCount:
788   case VPInstruction::BranchOnCond:
789   case VPInstruction::CanonicalIVIncrementForPart:
790     return true;
791   };
792   llvm_unreachable("switch should return");
793 }
794 
795 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
796 void VPInstruction::dump() const {
797   VPSlotTracker SlotTracker(getParent()->getPlan());
798   print(dbgs(), "", SlotTracker);
799 }
800 
801 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
802                           VPSlotTracker &SlotTracker) const {
803   O << Indent << "EMIT ";
804 
805   if (hasResult()) {
806     printAsOperand(O, SlotTracker);
807     O << " = ";
808   }
809 
810   switch (getOpcode()) {
811   case VPInstruction::Not:
812     O << "not";
813     break;
814   case VPInstruction::SLPLoad:
815     O << "combined load";
816     break;
817   case VPInstruction::SLPStore:
818     O << "combined store";
819     break;
820   case VPInstruction::ActiveLaneMask:
821     O << "active lane mask";
822     break;
823   case VPInstruction::ResumePhi:
824     O << "resume-phi";
825     break;
826   case VPInstruction::ExplicitVectorLength:
827     O << "EXPLICIT-VECTOR-LENGTH";
828     break;
829   case VPInstruction::FirstOrderRecurrenceSplice:
830     O << "first-order splice";
831     break;
832   case VPInstruction::BranchOnCond:
833     O << "branch-on-cond";
834     break;
835   case VPInstruction::CalculateTripCountMinusVF:
836     O << "TC > VF ? TC - VF : 0";
837     break;
838   case VPInstruction::CanonicalIVIncrementForPart:
839     O << "VF * Part +";
840     break;
841   case VPInstruction::BranchOnCount:
842     O << "branch-on-count";
843     break;
844   case VPInstruction::ExtractFromEnd:
845     O << "extract-from-end";
846     break;
847   case VPInstruction::ComputeReductionResult:
848     O << "compute-reduction-result";
849     break;
850   case VPInstruction::LogicalAnd:
851     O << "logical-and";
852     break;
853   case VPInstruction::PtrAdd:
854     O << "ptradd";
855     break;
856   default:
857     O << Instruction::getOpcodeName(getOpcode());
858   }
859 
860   printFlags(O);
861   printOperands(O, SlotTracker);
862 
863   if (auto DL = getDebugLoc()) {
864     O << ", !dbg ";
865     DL.print(O);
866   }
867 }
868 #endif
869 
870 void VPWidenCallRecipe::execute(VPTransformState &State) {
871   assert(State.VF.isVector() && "not widening");
872   Function *CalledScalarFn = getCalledScalarFunction();
873   assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
874          "DbgInfoIntrinsic should have been dropped during VPlan construction");
875   State.setDebugLocFrom(getDebugLoc());
876 
877   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
878   FunctionType *VFTy = nullptr;
879   if (Variant)
880     VFTy = Variant->getFunctionType();
881   for (unsigned Part = 0; Part < State.UF; ++Part) {
882     SmallVector<Type *, 2> TysForDecl;
883     // Add return type if intrinsic is overloaded on it.
884     if (UseIntrinsic &&
885         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
886       TysForDecl.push_back(VectorType::get(
887           CalledScalarFn->getReturnType()->getScalarType(), State.VF));
888     SmallVector<Value *, 4> Args;
889     for (const auto &I : enumerate(arg_operands())) {
890       // Some intrinsics have a scalar argument - don't replace it with a
891       // vector.
892       Value *Arg;
893       if (UseIntrinsic &&
894           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
895         Arg = State.get(I.value(), VPIteration(0, 0));
896       // Some vectorized function variants may also take a scalar argument,
897       // e.g. linear parameters for pointers. This needs to be the scalar value
898       // from the start of the respective part when interleaving.
899       else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
900         Arg = State.get(I.value(), VPIteration(Part, 0));
901       else
902         Arg = State.get(I.value(), Part);
903       if (UseIntrinsic &&
904           isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
905         TysForDecl.push_back(Arg->getType());
906       Args.push_back(Arg);
907     }
908 
909     Function *VectorF;
910     if (UseIntrinsic) {
911       // Use vector version of the intrinsic.
912       Module *M = State.Builder.GetInsertBlock()->getModule();
913       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
914       assert(VectorF && "Can't retrieve vector intrinsic.");
915     } else {
916 #ifndef NDEBUG
917       assert(Variant != nullptr && "Can't create vector function.");
918 #endif
919       VectorF = Variant;
920     }
921 
922     auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
923     SmallVector<OperandBundleDef, 1> OpBundles;
924     if (CI)
925       CI->getOperandBundlesAsDefs(OpBundles);
926 
927     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
928 
929     if (isa<FPMathOperator>(V))
930       V->copyFastMathFlags(CI);
931 
932     if (!V->getType()->isVoidTy())
933       State.set(this, V, Part);
934     State.addMetadata(V, CI);
935   }
936 }
937 
938 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
939                                                VPCostContext &Ctx) const {
940   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
941   if (Variant) {
942     return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
943                                     Variant->getFunctionType()->params(),
944                                     CostKind);
945   }
946 
947   FastMathFlags FMF;
948   // TODO: Manage flags via VPRecipeWithIRFlags.
949   if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
950     FMF = FPMO->getFastMathFlags();
951 
952   // Some backends analyze intrinsic arguments to determine cost. Use the
953   // underlying value for the operand if it has one. Otherwise try to use the
954   // operand of the underlying call instruction, if there is one. Otherwise
955   // clear Arguments.
956   // TODO: Rework TTI interface to be independent of concrete IR values.
957   SmallVector<const Value *> Arguments;
958   for (const auto &[Idx, Op] : enumerate(operands())) {
959     auto *V = Op->getUnderlyingValue();
960     if (!V) {
961       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
962         Arguments.push_back(UI->getArgOperand(Idx));
963         continue;
964       }
965       Arguments.clear();
966       break;
967     }
968     Arguments.push_back(V);
969   }
970 
971   Type *RetTy =
972       ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
973   SmallVector<Type *> ParamTys;
974   for (unsigned I = 0; I != getNumOperands(); ++I)
975     ParamTys.push_back(
976         ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
977 
978   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
979   IntrinsicCostAttributes CostAttrs(
980       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
981       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
982   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
983 }
984 
985 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
986 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
987                               VPSlotTracker &SlotTracker) const {
988   O << Indent << "WIDEN-CALL ";
989 
990   Function *CalledFn = getCalledScalarFunction();
991   if (CalledFn->getReturnType()->isVoidTy())
992     O << "void ";
993   else {
994     printAsOperand(O, SlotTracker);
995     O << " = ";
996   }
997 
998   O << "call @" << CalledFn->getName() << "(";
999   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
1000     Op->printAsOperand(O, SlotTracker);
1001   });
1002   O << ")";
1003 
1004   if (VectorIntrinsicID)
1005     O << " (using vector intrinsic)";
1006   else {
1007     O << " (using library function";
1008     if (Variant->hasName())
1009       O << ": " << Variant->getName();
1010     O << ")";
1011   }
1012 }
1013 
1014 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1015                                 VPSlotTracker &SlotTracker) const {
1016   O << Indent << "WIDEN-SELECT ";
1017   printAsOperand(O, SlotTracker);
1018   O << " = select ";
1019   getOperand(0)->printAsOperand(O, SlotTracker);
1020   O << ", ";
1021   getOperand(1)->printAsOperand(O, SlotTracker);
1022   O << ", ";
1023   getOperand(2)->printAsOperand(O, SlotTracker);
1024   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1025 }
1026 #endif
1027 
1028 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1029   State.setDebugLocFrom(getDebugLoc());
1030 
1031   // The condition can be loop invariant but still defined inside the
1032   // loop. This means that we can't just use the original 'cond' value.
1033   // We have to take the 'vectorized' value and pick the first lane.
1034   // Instcombine will make this a no-op.
1035   auto *InvarCond =
1036       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
1037 
1038   for (unsigned Part = 0; Part < State.UF; ++Part) {
1039     Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
1040     Value *Op0 = State.get(getOperand(1), Part);
1041     Value *Op1 = State.get(getOperand(2), Part);
1042     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1043     State.set(this, Sel, Part);
1044     State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1045   }
1046 }
1047 
1048 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
1049     const FastMathFlags &FMF) {
1050   AllowReassoc = FMF.allowReassoc();
1051   NoNaNs = FMF.noNaNs();
1052   NoInfs = FMF.noInfs();
1053   NoSignedZeros = FMF.noSignedZeros();
1054   AllowReciprocal = FMF.allowReciprocal();
1055   AllowContract = FMF.allowContract();
1056   ApproxFunc = FMF.approxFunc();
1057 }
1058 
1059 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1060 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1061   switch (OpType) {
1062   case OperationType::Cmp:
1063     O << " " << CmpInst::getPredicateName(getPredicate());
1064     break;
1065   case OperationType::DisjointOp:
1066     if (DisjointFlags.IsDisjoint)
1067       O << " disjoint";
1068     break;
1069   case OperationType::PossiblyExactOp:
1070     if (ExactFlags.IsExact)
1071       O << " exact";
1072     break;
1073   case OperationType::OverflowingBinOp:
1074     if (WrapFlags.HasNUW)
1075       O << " nuw";
1076     if (WrapFlags.HasNSW)
1077       O << " nsw";
1078     break;
1079   case OperationType::FPMathOp:
1080     getFastMathFlags().print(O);
1081     break;
1082   case OperationType::GEPOp:
1083     if (GEPFlags.IsInBounds)
1084       O << " inbounds";
1085     break;
1086   case OperationType::NonNegOp:
1087     if (NonNegFlags.NonNeg)
1088       O << " nneg";
1089     break;
1090   case OperationType::Other:
1091     break;
1092   }
1093   if (getNumOperands() > 0)
1094     O << " ";
1095 }
1096 #endif
1097 
1098 void VPWidenRecipe::execute(VPTransformState &State) {
1099   State.setDebugLocFrom(getDebugLoc());
1100   auto &Builder = State.Builder;
1101   switch (Opcode) {
1102   case Instruction::Call:
1103   case Instruction::Br:
1104   case Instruction::PHI:
1105   case Instruction::GetElementPtr:
1106   case Instruction::Select:
1107     llvm_unreachable("This instruction is handled by a different recipe.");
1108   case Instruction::UDiv:
1109   case Instruction::SDiv:
1110   case Instruction::SRem:
1111   case Instruction::URem:
1112   case Instruction::Add:
1113   case Instruction::FAdd:
1114   case Instruction::Sub:
1115   case Instruction::FSub:
1116   case Instruction::FNeg:
1117   case Instruction::Mul:
1118   case Instruction::FMul:
1119   case Instruction::FDiv:
1120   case Instruction::FRem:
1121   case Instruction::Shl:
1122   case Instruction::LShr:
1123   case Instruction::AShr:
1124   case Instruction::And:
1125   case Instruction::Or:
1126   case Instruction::Xor: {
1127     // Just widen unops and binops.
1128     for (unsigned Part = 0; Part < State.UF; ++Part) {
1129       SmallVector<Value *, 2> Ops;
1130       for (VPValue *VPOp : operands())
1131         Ops.push_back(State.get(VPOp, Part));
1132 
1133       Value *V = Builder.CreateNAryOp(Opcode, Ops);
1134 
1135       if (auto *VecOp = dyn_cast<Instruction>(V))
1136         setFlags(VecOp);
1137 
1138       // Use this vector value for all users of the original instruction.
1139       State.set(this, V, Part);
1140       State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1141     }
1142 
1143     break;
1144   }
1145   case Instruction::Freeze: {
1146     for (unsigned Part = 0; Part < State.UF; ++Part) {
1147       Value *Op = State.get(getOperand(0), Part);
1148 
1149       Value *Freeze = Builder.CreateFreeze(Op);
1150       State.set(this, Freeze, Part);
1151     }
1152     break;
1153   }
1154   case Instruction::ICmp:
1155   case Instruction::FCmp: {
1156     // Widen compares. Generate vector compares.
1157     bool FCmp = Opcode == Instruction::FCmp;
1158     for (unsigned Part = 0; Part < State.UF; ++Part) {
1159       Value *A = State.get(getOperand(0), Part);
1160       Value *B = State.get(getOperand(1), Part);
1161       Value *C = nullptr;
1162       if (FCmp) {
1163         // Propagate fast math flags.
1164         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1165         if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
1166           Builder.setFastMathFlags(I->getFastMathFlags());
1167         C = Builder.CreateFCmp(getPredicate(), A, B);
1168       } else {
1169         C = Builder.CreateICmp(getPredicate(), A, B);
1170       }
1171       State.set(this, C, Part);
1172       State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1173     }
1174 
1175     break;
1176   }
1177   default:
1178     // This instruction is not vectorized by simple widening.
1179     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1180                       << Instruction::getOpcodeName(Opcode));
1181     llvm_unreachable("Unhandled instruction!");
1182   } // end of switch.
1183 
1184 #if !defined(NDEBUG)
1185   // Verify that VPlan type inference results agree with the type of the
1186   // generated values.
1187   for (unsigned Part = 0; Part < State.UF; ++Part) {
1188     assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
1189                            State.VF) == State.get(this, Part)->getType() &&
1190            "inferred type and type from generated instructions do not match");
1191   }
1192 #endif
1193 }
1194 
1195 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1196                                            VPCostContext &Ctx) const {
1197   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1198   switch (Opcode) {
1199   case Instruction::FNeg: {
1200     Type *VectorTy =
1201         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1202     return Ctx.TTI.getArithmeticInstrCost(
1203         Opcode, VectorTy, CostKind,
1204         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1205         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1206   }
1207 
1208   case Instruction::UDiv:
1209   case Instruction::SDiv:
1210   case Instruction::SRem:
1211   case Instruction::URem:
1212     // More complex computation, let the legacy cost-model handle this for now.
1213     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1214   case Instruction::Add:
1215   case Instruction::FAdd:
1216   case Instruction::Sub:
1217   case Instruction::FSub:
1218   case Instruction::Mul:
1219   case Instruction::FMul:
1220   case Instruction::FDiv:
1221   case Instruction::FRem:
1222   case Instruction::Shl:
1223   case Instruction::LShr:
1224   case Instruction::AShr:
1225   case Instruction::And:
1226   case Instruction::Or:
1227   case Instruction::Xor: {
1228     VPValue *RHS = getOperand(1);
1229     // Certain instructions can be cheaper to vectorize if they have a constant
1230     // second vector operand. One example of this are shifts on x86.
1231     TargetTransformInfo::OperandValueInfo RHSInfo = {
1232         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1233     if (RHS->isLiveIn())
1234       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1235 
1236     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1237         getOperand(1)->isDefinedOutsideVectorRegions())
1238       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1239     Type *VectorTy =
1240         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1241     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1242 
1243     SmallVector<const Value *, 4> Operands;
1244     if (CtxI)
1245       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1246     return Ctx.TTI.getArithmeticInstrCost(
1247         Opcode, VectorTy, CostKind,
1248         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1249         RHSInfo, Operands, CtxI, &Ctx.TLI);
1250   }
1251   case Instruction::Freeze: {
1252     // This opcode is unknown. Assume that it is the same as 'mul'.
1253     Type *VectorTy =
1254         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1255     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
1256   }
1257   case Instruction::ICmp:
1258   case Instruction::FCmp: {
1259     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1260     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1261     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1262                                       CostKind, CtxI);
1263   }
1264   default:
1265     llvm_unreachable("Unsupported opcode for instruction");
1266   }
1267 }
1268 
1269 void VPWidenEVLRecipe::execute(VPTransformState &State) {
1270   unsigned Opcode = getOpcode();
1271   // TODO: Support other opcodes
1272   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1273     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
1274 
1275   State.setDebugLocFrom(getDebugLoc());
1276   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
1277                           "explicit vector length.");
1278 
1279   assert(State.get(getOperand(0), 0)->getType()->isVectorTy() &&
1280          "VPWidenEVLRecipe should not be used for scalars");
1281 
1282   VPValue *EVL = getEVL();
1283   Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
1284   IRBuilderBase &BuilderIR = State.Builder;
1285   VectorBuilder Builder(BuilderIR);
1286   Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
1287 
1288   SmallVector<Value *, 4> Ops;
1289   for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
1290     VPValue *VPOp = getOperand(I);
1291     Ops.push_back(State.get(VPOp, 0));
1292   }
1293 
1294   Builder.setMask(Mask).setEVL(EVLArg);
1295   Value *VPInst =
1296       Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
1297   // Currently vp-intrinsics only accept FMF flags.
1298   // TODO: Enable other flags when support is added.
1299   if (isa<FPMathOperator>(VPInst))
1300     setFlags(cast<Instruction>(VPInst));
1301 
1302   State.set(this, VPInst, 0);
1303   State.addMetadata(VPInst,
1304                     dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1305 }
1306 
1307 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1308 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1309                           VPSlotTracker &SlotTracker) const {
1310   O << Indent << "WIDEN ";
1311   printAsOperand(O, SlotTracker);
1312   O << " = " << Instruction::getOpcodeName(Opcode);
1313   printFlags(O);
1314   printOperands(O, SlotTracker);
1315 }
1316 
1317 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1318                              VPSlotTracker &SlotTracker) const {
1319   O << Indent << "WIDEN-VP ";
1320   printAsOperand(O, SlotTracker);
1321   O << " = " << Instruction::getOpcodeName(getOpcode());
1322   printFlags(O);
1323   printOperands(O, SlotTracker);
1324 }
1325 #endif
1326 
1327 void VPWidenCastRecipe::execute(VPTransformState &State) {
1328   State.setDebugLocFrom(getDebugLoc());
1329   auto &Builder = State.Builder;
1330   /// Vectorize casts.
1331   assert(State.VF.isVector() && "Not vectorizing?");
1332   Type *DestTy = VectorType::get(getResultType(), State.VF);
1333   VPValue *Op = getOperand(0);
1334   for (unsigned Part = 0; Part < State.UF; ++Part) {
1335     if (Part > 0 && Op->isLiveIn()) {
1336       // FIXME: Remove once explicit unrolling is implemented using VPlan.
1337       State.set(this, State.get(this, 0), Part);
1338       continue;
1339     }
1340     Value *A = State.get(Op, Part);
1341     Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1342     State.set(this, Cast, Part);
1343     State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1344   }
1345 }
1346 
1347 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1348 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1349                               VPSlotTracker &SlotTracker) const {
1350   O << Indent << "WIDEN-CAST ";
1351   printAsOperand(O, SlotTracker);
1352   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1353   printFlags(O);
1354   printOperands(O, SlotTracker);
1355   O << " to " << *getResultType();
1356 }
1357 #endif
1358 
1359 /// This function adds
1360 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
1361 /// to each vector element of Val. The sequence starts at StartIndex.
1362 /// \p Opcode is relevant for FP induction variable.
1363 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1364                             Instruction::BinaryOps BinOp, ElementCount VF,
1365                             IRBuilderBase &Builder) {
1366   assert(VF.isVector() && "only vector VFs are supported");
1367 
1368   // Create and check the types.
1369   auto *ValVTy = cast<VectorType>(Val->getType());
1370   ElementCount VLen = ValVTy->getElementCount();
1371 
1372   Type *STy = Val->getType()->getScalarType();
1373   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1374          "Induction Step must be an integer or FP");
1375   assert(Step->getType() == STy && "Step has wrong type");
1376 
1377   SmallVector<Constant *, 8> Indices;
1378 
1379   // Create a vector of consecutive numbers from zero to VF.
1380   VectorType *InitVecValVTy = ValVTy;
1381   if (STy->isFloatingPointTy()) {
1382     Type *InitVecValSTy =
1383         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1384     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1385   }
1386   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1387 
1388   // Splat the StartIdx
1389   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1390 
1391   if (STy->isIntegerTy()) {
1392     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1393     Step = Builder.CreateVectorSplat(VLen, Step);
1394     assert(Step->getType() == Val->getType() && "Invalid step vec");
1395     // FIXME: The newly created binary instructions should contain nsw/nuw
1396     // flags, which can be found from the original scalar operations.
1397     Step = Builder.CreateMul(InitVec, Step);
1398     return Builder.CreateAdd(Val, Step, "induction");
1399   }
1400 
1401   // Floating point induction.
1402   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1403          "Binary Opcode should be specified for FP induction");
1404   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1405   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1406 
1407   Step = Builder.CreateVectorSplat(VLen, Step);
1408   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1409   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1410 }
1411 
1412 /// A helper function that returns a signed integer or floating-point constant
1413 /// with value C.
1414 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1415   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1416                            : ConstantFP::get(Ty, C);
1417 }
1418 
1419 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1420                                   ElementCount VF) {
1421   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1422   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1423   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1424   return B.CreateUIToFP(RuntimeVF, FTy);
1425 }
1426 
1427 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1428   assert(!State.Instance && "Int or FP induction being replicated.");
1429 
1430   Value *Start = getStartValue()->getLiveInIRValue();
1431   const InductionDescriptor &ID = getInductionDescriptor();
1432   TruncInst *Trunc = getTruncInst();
1433   IRBuilderBase &Builder = State.Builder;
1434   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1435   assert(State.VF.isVector() && "must have vector VF");
1436 
1437   // The value from the original loop to which we are mapping the new induction
1438   // variable.
1439   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1440 
1441   // Fast-math-flags propagate from the original induction instruction.
1442   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1443   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1444     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1445 
1446   // Now do the actual transformations, and start with fetching the step value.
1447   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1448 
1449   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1450          "Expected either an induction phi-node or a truncate of it!");
1451 
1452   // Construct the initial value of the vector IV in the vector loop preheader
1453   auto CurrIP = Builder.saveIP();
1454   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1455   Builder.SetInsertPoint(VectorPH->getTerminator());
1456   if (isa<TruncInst>(EntryVal)) {
1457     assert(Start->getType()->isIntegerTy() &&
1458            "Truncation requires an integer type");
1459     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1460     Step = Builder.CreateTrunc(Step, TruncType);
1461     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1462   }
1463 
1464   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1465   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1466   Value *SteppedStart = getStepVector(
1467       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
1468 
1469   // We create vector phi nodes for both integer and floating-point induction
1470   // variables. Here, we determine the kind of arithmetic we will perform.
1471   Instruction::BinaryOps AddOp;
1472   Instruction::BinaryOps MulOp;
1473   if (Step->getType()->isIntegerTy()) {
1474     AddOp = Instruction::Add;
1475     MulOp = Instruction::Mul;
1476   } else {
1477     AddOp = ID.getInductionOpcode();
1478     MulOp = Instruction::FMul;
1479   }
1480 
1481   // Multiply the vectorization factor by the step using integer or
1482   // floating-point arithmetic as appropriate.
1483   Type *StepType = Step->getType();
1484   Value *RuntimeVF;
1485   if (Step->getType()->isFloatingPointTy())
1486     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
1487   else
1488     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
1489   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1490 
1491   // Create a vector splat to use in the induction update.
1492   //
1493   // FIXME: If the step is non-constant, we create the vector splat with
1494   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1495   //        handle a constant vector splat.
1496   Value *SplatVF = isa<Constant>(Mul)
1497                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1498                        : Builder.CreateVectorSplat(State.VF, Mul);
1499   Builder.restoreIP(CurrIP);
1500 
1501   // We may need to add the step a number of times, depending on the unroll
1502   // factor. The last of those goes into the PHI.
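  // For an integer IV with UF = 2, the generated code looks roughly like this
  // (%step.vf is Step * RuntimeVF splatted to a vector):
  //   vector.ph:
  //     %stepped.start = <%start, %start + %step, %start + 2 * %step, ...>
  //   vector.body:
  //     %vec.ind = phi [ %stepped.start, %vector.ph ], [ %vec.ind.next, ... ]
  //     %step.add = add %vec.ind, %step.vf            ; value for part 1
  //     %vec.ind.next = add %step.add, %step.vf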
1503   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1504   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1505   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1506   Instruction *LastInduction = VecInd;
1507   for (unsigned Part = 0; Part < State.UF; ++Part) {
1508     State.set(this, LastInduction, Part);
1509 
1510     if (isa<TruncInst>(EntryVal))
1511       State.addMetadata(LastInduction, EntryVal);
1512 
1513     LastInduction = cast<Instruction>(
1514         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
1515     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1516   }
1517 
1518   LastInduction->setName("vec.ind.next");
1519   VecInd->addIncoming(SteppedStart, VectorPH);
1520   // Add induction update using an incorrect block temporarily. The phi node
1521   // will be fixed after VPlan execution. Note that at this point the latch
1522   // block cannot be used, as it does not exist yet.
1523   // TODO: Model increment value in VPlan, by turning the recipe into a
1524   // multi-def and a subclass of VPHeaderPHIRecipe.
1525   VecInd->addIncoming(LastInduction, VectorPH);
1526 }
1527 
1528 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1529 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1530                                           VPSlotTracker &SlotTracker) const {
1531   O << Indent << "WIDEN-INDUCTION";
1532   if (getTruncInst()) {
1533     O << "\\l\"";
1534     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1535     O << " +\n" << Indent << "\"  ";
1536     getVPValue(0)->printAsOperand(O, SlotTracker);
1537   } else
1538     O << " " << VPlanIngredient(IV);
1539 
1540   O << ", ";
1541   getStepValue()->printAsOperand(O, SlotTracker);
1542 }
1543 #endif
1544 
1545 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1546   // The step may be defined by a recipe in the preheader (e.g. if it requires
1547   // SCEV expansion), but for the canonical induction the step is required to be
1548   // 1, which is represented as a live-in.
1549   if (getStepValue()->getDefiningRecipe())
1550     return false;
1551   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1552   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1553   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1554   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1555          getScalarType() == CanIV->getScalarType();
1556 }
1557 
1558 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1559 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1560                               VPSlotTracker &SlotTracker) const {
1561   O << Indent;
1562   printAsOperand(O, SlotTracker);
1563   O << " = DERIVED-IV ";
1564   getStartValue()->printAsOperand(O, SlotTracker);
1565   O << " + ";
1566   getOperand(1)->printAsOperand(O, SlotTracker);
1567   O << " * ";
1568   getStepValue()->printAsOperand(O, SlotTracker);
1569 }
1570 #endif
1571 
1572 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1573   // Fast-math-flags propagate from the original induction instruction.
1574   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1575   if (hasFastMathFlags())
1576     State.Builder.setFastMathFlags(getFastMathFlags());
1577 
1578   // Compute scalar induction steps. BaseIV is the scalar induction variable on
1579   // which to base the steps; Step is the size of the step.
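  // For example (integer case, illustrative), for a fixed VF = 4, UF = 1,
  // BaseIV = %iv and Step = %s, lane L of part 0 receives %iv + L * %s.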
1580 
1581   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1582   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1583   IRBuilderBase &Builder = State.Builder;
1584 
1585   // Ensure step has the same type as that of scalar IV.
1586   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1587   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1588 
1589   // We build scalar steps for both integer and floating-point induction
1590   // variables. Here, we determine the kind of arithmetic we will perform.
1591   Instruction::BinaryOps AddOp;
1592   Instruction::BinaryOps MulOp;
1593   if (BaseIVTy->isIntegerTy()) {
1594     AddOp = Instruction::Add;
1595     MulOp = Instruction::Mul;
1596   } else {
1597     AddOp = InductionOpcode;
1598     MulOp = Instruction::FMul;
1599   }
1600 
1601   // Determine the number of scalars we need to generate for each unroll
1602   // iteration.
1603   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1604   // Compute the scalar steps and save the results in State.
1605   Type *IntStepTy =
1606       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1607   Type *VecIVTy = nullptr;
1608   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1609   if (!FirstLaneOnly && State.VF.isScalable()) {
1610     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1611     UnitStepVec =
1612         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1613     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1614     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1615   }
1616 
1617   unsigned StartPart = 0;
1618   unsigned EndPart = State.UF;
1619   unsigned StartLane = 0;
1620   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1621   if (State.Instance) {
1622     StartPart = State.Instance->Part;
1623     EndPart = StartPart + 1;
1624     StartLane = State.Instance->Lane.getKnownLane();
1625     EndLane = StartLane + 1;
1626   }
1627   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1628     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1629 
1630     if (!FirstLaneOnly && State.VF.isScalable()) {
1631       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1632       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1633       if (BaseIVTy->isFloatingPointTy())
1634         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1635       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1636       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1637       State.set(this, Add, Part);
1638       // It's also useful to record the lane values for the known minimum number
1639       // of elements, so we compute those below as well. This improves the code
1640       // quality when extracting the first element, for example.
1641     }
1642 
1643     if (BaseIVTy->isFloatingPointTy())
1644       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1645 
1646     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1647       Value *StartIdx = Builder.CreateBinOp(
1648           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1649       // The step returned by `createStepForVF` is a runtime-evaluated value
1650       // when VF is scalable. Otherwise, it should be folded into a Constant.
1651       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1652              "Expected StartIdx to be folded to a constant when VF is not "
1653              "scalable");
1654       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1655       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1656       State.set(this, Add, VPIteration(Part, Lane));
1657     }
1658   }
1659 }
1660 
1661 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1662 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1663                                   VPSlotTracker &SlotTracker) const {
1664   O << Indent;
1665   printAsOperand(O, SlotTracker);
1666   O << " = SCALAR-STEPS ";
1667   printOperands(O, SlotTracker);
1668 }
1669 #endif
1670 
1671 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1672   assert(State.VF.isVector() && "not widening");
1673   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1674   // Construct a vector GEP by widening the operands of the scalar GEP as
1675   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1676   // results in a vector of pointers when at least one operand of the GEP
1677   // is vector-typed. Thus, to keep the representation compact, we only use
1678   // vector-typed operands for loop-varying values.
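  //
  // For example, for VF = 4, a scalar GEP with a loop-varying index such as
  //   %gep = getelementptr i32, ptr %base, i64 %iv
  // is widened (illustratively) to
  //   %vec.gep = getelementptr i32, ptr %base, <4 x i64> %vec.iv
  // which produces a <4 x ptr> result.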
1679 
1680   if (areAllOperandsInvariant()) {
1681     // If we are vectorizing, but the GEP has only loop-invariant operands,
1682     // the GEP we build (by only using vector-typed operands for
1683     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1684     // produce a vector of pointers, we need to either arbitrarily pick an
1685     // operand to broadcast, or broadcast a clone of the original GEP.
1686     // Here, we broadcast a clone of the original.
1687     //
1688     // TODO: If at some point we decide to scalarize instructions having
1689     //       loop-invariant operands, this special case will no longer be
1690     //       required. We would add the scalarization decision to
1691     //       collectLoopScalars() and teach getVectorValue() to broadcast
1692     //       the lane-zero scalar value.
1693     SmallVector<Value *> Ops;
1694     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1695       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1696 
1697     auto *NewGEP =
1698         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1699                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1700     for (unsigned Part = 0; Part < State.UF; ++Part) {
1701       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1702       State.set(this, EntryPart, Part);
1703       State.addMetadata(EntryPart, GEP);
1704     }
1705   } else {
1706     // If the GEP has at least one loop-varying operand, we are sure to
1707     // produce a vector of pointers. But if we are only unrolling, we want
1708     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1709     // produce with the code below will be scalar (if VF == 1) or vector
1710     // (otherwise). Note that for the unroll-only case, we still maintain
1711     // values in the vector mapping, as we do for other
1712     // instructions.
1713     for (unsigned Part = 0; Part < State.UF; ++Part) {
1714       // The pointer operand of the new GEP. If it's loop-invariant, we
1715       // won't broadcast it.
1716       auto *Ptr = isPointerLoopInvariant()
1717                       ? State.get(getOperand(0), VPIteration(0, 0))
1718                       : State.get(getOperand(0), Part);
1719 
1720       // Collect all the indices for the new GEP. If any index is
1721       // loop-invariant, we won't broadcast it.
1722       SmallVector<Value *, 4> Indices;
1723       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1724         VPValue *Operand = getOperand(I);
1725         if (isIndexLoopInvariant(I - 1))
1726           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1727         else
1728           Indices.push_back(State.get(Operand, Part));
1729       }
1730 
1731       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1732       // but it should be a vector, otherwise.
1733       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1734                                              Indices, "", isInBounds());
1735       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1736              "NewGEP is not a pointer vector");
1737       State.set(this, NewGEP, Part);
1738       State.addMetadata(NewGEP, GEP);
1739     }
1740   }
1741 }
1742 
1743 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1744 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1745                              VPSlotTracker &SlotTracker) const {
1746   O << Indent << "WIDEN-GEP ";
1747   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1748   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1749     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1750 
1751   O << " ";
1752   printAsOperand(O, SlotTracker);
1753   O << " = getelementptr";
1754   printFlags(O);
1755   printOperands(O, SlotTracker);
1756 }
1757 #endif
1758 
1759 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1760   auto &Builder = State.Builder;
1761   State.setDebugLocFrom(getDebugLoc());
1762   for (unsigned Part = 0; Part < State.UF; ++Part) {
1763     // Calculate the pointer for the specific unroll-part.
1764     Value *PartPtr = nullptr;
1765     // Use i32 for the gep index type when the value is constant,
1766     // or query DataLayout for a more suitable index type otherwise.
1767     const DataLayout &DL =
1768         Builder.GetInsertBlock()->getDataLayout();
1769     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1770                         ? DL.getIndexType(IndexedTy->getPointerTo())
1771                         : Builder.getInt32Ty();
1772     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1773     bool InBounds = isInBounds();
1774     if (IsReverse) {
1775       // If the address is consecutive but reversed, then the wide load or
1776       // store needs to start at the last vector element.
1777       // RunTimeVF = VScale * VF.getKnownMinValue()
1778       // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue().
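      // For example, for a fixed VF = 4 and Part = 1, RunTimeVF = 4, so
      // NumElt = -4 and LastLane = -3; the part pointer ends up 7 elements
      // before Ptr, i.e. at the lowest address accessed by this part.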
1779       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1780       // NumElt = -Part * RunTimeVF
1781       Value *NumElt = Builder.CreateMul(
1782           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1783       // LastLane = 1 - RunTimeVF
1784       Value *LastLane =
1785           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1786       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1787       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1788     } else {
1789       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1790       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1791     }
1792 
1793     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1794   }
1795 }
1796 
1797 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1798 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1799                                   VPSlotTracker &SlotTracker) const {
1800   O << Indent;
1801   printAsOperand(O, SlotTracker);
1802   O << " = vector-pointer ";
1803   if (IsReverse)
1804     O << "(reverse) ";
1805 
1806   printOperands(O, SlotTracker);
1807 }
1808 #endif
1809 
1810 void VPBlendRecipe::execute(VPTransformState &State) {
1811   assert(isNormalized() && "Expected blend to be normalized!");
1812   State.setDebugLocFrom(getDebugLoc());
1813   // We know that all PHIs in non-header blocks are converted into
1814   // selects, so we don't have to worry about the insertion order and we
1815   // can just use the builder.
1816   // At this point we generate the predication tree. There may be
1817   // duplications since this is a simple recursive scan, but future
1818   // optimizations will clean it up.
1819 
1820   unsigned NumIncoming = getNumIncomingValues();
1821 
1822   // Generate a sequence of selects of the form:
1823   // SELECT(Mask3, In3,
1824   //        SELECT(Mask2, In2,
1825   //               SELECT(Mask1, In1,
1826   //                      In0)))
1827   // Note that Mask0 is never used: lanes for which no path reaches this phi
1828   // are essentially undef and are taken from In0.
1829   VectorParts Entry(State.UF);
1830   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1831   for (unsigned In = 0; In < NumIncoming; ++In) {
1832     for (unsigned Part = 0; Part < State.UF; ++Part) {
1833       // We might have single edge PHIs (blocks) - use an identity
1834       // 'select' for the first PHI operand.
1835       Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
1836       if (In == 0)
1837         Entry[Part] = In0; // Initialize with the first incoming value.
1838       else {
1839         // Select between the current value and the previous incoming edge
1840         // based on the incoming mask.
1841         Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
1842         Entry[Part] =
1843             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1844       }
1845     }
1846   }
1847 
1848   for (unsigned Part = 0; Part < State.UF; ++Part)
1849     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
1850 }
1851 
1852 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1853 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1854                           VPSlotTracker &SlotTracker) const {
1855   O << Indent << "BLEND ";
1856   printAsOperand(O, SlotTracker);
1857   O << " =";
1858   if (getNumIncomingValues() == 1) {
1859     // Not a User of any mask: not really blending, this is a
1860     // single-predecessor phi.
1861     O << " ";
1862     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1863   } else {
1864     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1865       O << " ";
1866       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1867       if (I == 0)
1868         continue;
1869       O << "/";
1870       getMask(I)->printAsOperand(O, SlotTracker);
1871     }
1872   }
1873 }
1874 #endif
1875 
1876 void VPReductionRecipe::execute(VPTransformState &State) {
1877   assert(!State.Instance && "Reduction being replicated.");
1878   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1879   RecurKind Kind = RdxDesc.getRecurrenceKind();
1880   // Propagate the fast-math flags carried by the underlying instruction.
1881   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1882   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
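  // For example (illustrative), a conditional integer add reduction replaces
  // masked-off lanes with the identity (0) before reducing:
  //   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)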
1883   for (unsigned Part = 0; Part < State.UF; ++Part) {
1884     Value *NewVecOp = State.get(getVecOp(), Part);
1885     if (VPValue *Cond = getCondOp()) {
1886       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1887       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1888       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1889 
1890       Value *Start;
1891       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
1892         Start = RdxDesc.getRecurrenceStartValue();
1893       else
1894         Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
1895                                             RdxDesc.getFastMathFlags());
1896       if (State.VF.isVector())
1897         Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(),
1898                                                 Start);
1899 
1900       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
1901       NewVecOp = Select;
1902     }
1903     Value *NewRed;
1904     Value *NextInChain;
1905     if (IsOrdered) {
1906       if (State.VF.isVector())
1907         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1908                                         PrevInChain);
1909       else
1910         NewRed = State.Builder.CreateBinOp(
1911             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1912             NewVecOp);
1913       PrevInChain = NewRed;
1914       NextInChain = NewRed;
1915     } else {
1916       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1917       NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
1918       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1919         NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1920                                      NewRed, PrevInChain);
1921       else
1922         NextInChain = State.Builder.CreateBinOp(
1923             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed,
1924             PrevInChain);
1925     }
1926     State.set(this, NextInChain, Part, /*IsScalar*/ true);
1927   }
1928 }
1929 
1930 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1931   assert(!State.Instance && "Reduction being replicated.");
1932   assert(State.UF == 1 &&
1933          "Expected only UF == 1 when vectorizing with explicit vector length.");
1934 
1935   auto &Builder = State.Builder;
1936   // Propagate the fast-math flags carried by the underlying instruction.
1937   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1938   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1939   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1940 
1941   RecurKind Kind = RdxDesc.getRecurrenceKind();
1942   Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
1943   Value *VecOp = State.get(getVecOp(), 0);
1944   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
1945 
1946   VectorBuilder VBuilder(Builder);
1947   VBuilder.setEVL(EVL);
1948   Value *Mask;
1949   // TODO: move the all-true mask generation into VectorBuilder.
1950   if (VPValue *CondOp = getCondOp())
1951     Mask = State.get(CondOp, 0);
1952   else
1953     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
1954   VBuilder.setMask(Mask);
1955 
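  // In the non-ordered case below, the vector is reduced with a VP reduction
  // intrinsic and the result is then combined with the chain value, e.g.
  // (illustrative, integer add):
  //   %r = call i32 @llvm.vp.reduce.add.v4i32(i32 0, <4 x i32> %vec,
  //                                           <4 x i1> %mask, i32 %evl)
  //   %next = add i32 %r, %prev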
1956   Value *NewRed;
1957   if (isOrdered()) {
1958     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
1959   } else {
1960     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
1961     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1962       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
1963     else
1964       NewRed = Builder.CreateBinOp(
1965           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
1966   }
1967   State.set(this, NewRed, 0, /*IsScalar*/ true);
1968 }
1969 
1970 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1971 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1972                               VPSlotTracker &SlotTracker) const {
1973   O << Indent << "REDUCE ";
1974   printAsOperand(O, SlotTracker);
1975   O << " = ";
1976   getChainOp()->printAsOperand(O, SlotTracker);
1977   O << " +";
1978   if (isa<FPMathOperator>(getUnderlyingInstr()))
1979     O << getUnderlyingInstr()->getFastMathFlags();
1980   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1981   getVecOp()->printAsOperand(O, SlotTracker);
1982   if (isConditional()) {
1983     O << ", ";
1984     getCondOp()->printAsOperand(O, SlotTracker);
1985   }
1986   O << ")";
1987   if (RdxDesc.IntermediateStore)
1988     O << " (with final reduction value stored in invariant address sank "
1989          "outside of loop)";
1990 }
1991 
1992 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1993                                  VPSlotTracker &SlotTracker) const {
1994   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1995   O << Indent << "REDUCE ";
1996   printAsOperand(O, SlotTracker);
1997   O << " = ";
1998   getChainOp()->printAsOperand(O, SlotTracker);
1999   O << " +";
2000   if (isa<FPMathOperator>(getUnderlyingInstr()))
2001     O << getUnderlyingInstr()->getFastMathFlags();
2002   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2003   getVecOp()->printAsOperand(O, SlotTracker);
2004   O << ", ";
2005   getEVL()->printAsOperand(O, SlotTracker);
2006   if (isConditional()) {
2007     O << ", ";
2008     getCondOp()->printAsOperand(O, SlotTracker);
2009   }
2010   O << ")";
2011   if (RdxDesc.IntermediateStore)
2012     O << " (with final reduction value stored in invariant address sank "
2013          "outside of loop)";
2014 }
2015 #endif
2016 
2017 bool VPReplicateRecipe::shouldPack() const {
2018   // Find if the recipe is used by a widened recipe via an intervening
2019   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2020   return any_of(users(), [](const VPUser *U) {
2021     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2022       return any_of(PredR->users(), [PredR](const VPUser *U) {
2023         return !U->usesScalars(PredR);
2024       });
2025     return false;
2026   });
2027 }
2028 
2029 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2030 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2031                               VPSlotTracker &SlotTracker) const {
2032   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2033 
2034   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2035     printAsOperand(O, SlotTracker);
2036     O << " = ";
2037   }
2038   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2039     O << "call";
2040     printFlags(O);
2041     O << "@" << CB->getCalledFunction()->getName() << "(";
2042     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2043                     O, [&O, &SlotTracker](VPValue *Op) {
2044                       Op->printAsOperand(O, SlotTracker);
2045                     });
2046     O << ")";
2047   } else {
2048     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2049     printFlags(O);
2050     printOperands(O, SlotTracker);
2051   }
2052 
2053   if (shouldPack())
2054     O << " (S->V)";
2055 }
2056 #endif
2057 
2058 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
2059 /// if it is either defined outside the vector region or its operand is known to
2060 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
2061 /// TODO: Uniformity should be associated with a VPValue and there should be a
2062 /// generic way to check.
2063 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
2064   return C->isDefinedOutsideVectorRegions() ||
2065          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
2066          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
2067 }
2068 
2069 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
2070   assert(vputils::onlyFirstLaneUsed(this) &&
2071          "Codegen only implemented for first lane.");
2072   switch (Opcode) {
2073   case Instruction::SExt:
2074   case Instruction::ZExt:
2075   case Instruction::Trunc: {
2076     // Note: SExt/ZExt not used yet.
2077     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
2078     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2079   }
2080   default:
2081     llvm_unreachable("opcode not implemented yet");
2082   }
2083 }
2084 
2085 void VPScalarCastRecipe::execute(VPTransformState &State) {
2086   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
2087   for (unsigned Part = 0; Part != State.UF; ++Part) {
2088     Value *Res;
2089     // Only generate a single instance, if the recipe is uniform across UFs and
2090     // VFs.
2091     if (Part > 0 && IsUniformAcrossVFsAndUFs)
2092       Res = State.get(this, VPIteration(0, 0));
2093     else
2094       Res = generate(State, Part);
2095     State.set(this, Res, VPIteration(Part, 0));
2096   }
2097 }
2098 
2099 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2100 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2101                                VPSlotTracker &SlotTracker) const {
2102   O << Indent << "SCALAR-CAST ";
2103   printAsOperand(O, SlotTracker);
2104   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2105   printOperands(O, SlotTracker);
2106   O << " to " << *ResultTy;
2107 }
2108 #endif
2109 
2110 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2111   assert(State.Instance && "Branch on Mask works only on single instance.");
2112 
2113   unsigned Part = State.Instance->Part;
2114   unsigned Lane = State.Instance->Lane.getKnownLane();
2115 
2116   Value *ConditionBit = nullptr;
2117   VPValue *BlockInMask = getMask();
2118   if (BlockInMask) {
2119     ConditionBit = State.get(BlockInMask, Part);
2120     if (ConditionBit->getType()->isVectorTy())
2121       ConditionBit = State.Builder.CreateExtractElement(
2122           ConditionBit, State.Builder.getInt32(Lane));
2123   } else // Block in mask is all-one.
2124     ConditionBit = State.Builder.getTrue();
2125 
2126   // Replace the temporary unreachable terminator with a new conditional branch,
2127   // whose two destinations will be set later when they are created.
2128   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2129   assert(isa<UnreachableInst>(CurrentTerminator) &&
2130          "Expected to replace unreachable terminator with conditional branch.");
2131   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2132   CondBr->setSuccessor(0, nullptr);
2133   ReplaceInstWithInst(CurrentTerminator, CondBr);
2134 }
2135 
2136 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2137   assert(State.Instance && "Predicated instruction PHI works per instance.");
2138   Instruction *ScalarPredInst =
2139       cast<Instruction>(State.get(getOperand(0), *State.Instance));
2140   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2141   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2142   assert(PredicatingBB && "Predicated block has no single predecessor.");
2143   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2144          "operand must be VPReplicateRecipe");
2145 
2146   // By current pack/unpack logic we need to generate only a single phi node: if
2147   // a vector value for the predicated instruction exists at this point it means
2148   // the instruction has vector users only, and a phi for the vector value is
2149   // needed. In this case the recipe of the predicated instruction is marked to
2150   // also do that packing, thereby "hoisting" the insert-element sequence.
2151   // Otherwise, a phi node for the scalar value is needed.
2152   unsigned Part = State.Instance->Part;
2153   if (State.hasVectorValue(getOperand(0), Part)) {
2154     Value *VectorValue = State.get(getOperand(0), Part);
2155     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2156     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2157     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2158     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2159     if (State.hasVectorValue(this, Part))
2160       State.reset(this, VPhi, Part);
2161     else
2162       State.set(this, VPhi, Part);
2163     // NOTE: Currently we need to update the value of the operand, so the next
2164     // predicated iteration inserts its generated value in the correct vector.
2165     State.reset(getOperand(0), VPhi, Part);
2166   } else {
2167     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2168     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2169     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2170                      PredicatingBB);
2171     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2172     if (State.hasScalarValue(this, *State.Instance))
2173       State.reset(this, Phi, *State.Instance);
2174     else
2175       State.set(this, Phi, *State.Instance);
2176     // NOTE: Currently we need to update the value of the operand, so the next
2177     // predicated iteration inserts its generated value in the correct vector.
2178     State.reset(getOperand(0), Phi, *State.Instance);
2179   }
2180 }
2181 
2182 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2183 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2184                                 VPSlotTracker &SlotTracker) const {
2185   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2186   printAsOperand(O, SlotTracker);
2187   O << " = ";
2188   printOperands(O, SlotTracker);
2189 }
2190 #endif
2191 
2192 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2193                                                  VPCostContext &Ctx) const {
2194   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2195   const Align Alignment =
2196       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2197   unsigned AS =
2198       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2199   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2200 
2201   if (!Consecutive) {
2202     // TODO: Using the original IR may not be accurate.
2203     // Currently, ARM will use the underlying IR to calculate gather/scatter
2204     // instruction cost.
2205     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2206     assert(!Reverse &&
2207            "Inconsecutive memory accesses should not be reversed.");
2208     return Ctx.TTI.getAddressComputationCost(Ty) +
2209            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2210                                           IsMasked, Alignment, CostKind,
2211                                           &Ingredient);
2212   }
2213 
2214   InstructionCost Cost = 0;
2215   if (IsMasked) {
2216     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2217                                           AS, CostKind);
2218   } else {
2219     TTI::OperandValueInfo OpInfo =
2220         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2221     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2222                                     CostKind, OpInfo, &Ingredient);
2223   }
2224   if (!Reverse)
2225     return Cost;
2226 
2227   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2228                                         cast<VectorType>(Ty), std::nullopt,
2229                                         CostKind, 0);
2230 }
2231 
2232 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2233   auto *LI = cast<LoadInst>(&Ingredient);
2234 
2235   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2236   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2237   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2238   bool CreateGather = !isConsecutive();
2239 
2240   auto &Builder = State.Builder;
2241   State.setDebugLocFrom(getDebugLoc());
2242   for (unsigned Part = 0; Part < State.UF; ++Part) {
2243     Value *NewLI;
2244     Value *Mask = nullptr;
2245     if (auto *VPMask = getMask()) {
2246       // Mask reversal is only needed when a mask is present; a null mask stands
2247       // for an all-one mask, and the reverse of an all-one mask is still all-one.
2248       Mask = State.get(VPMask, Part);
2249       if (isReverse())
2250         Mask = Builder.CreateVectorReverse(Mask, "reverse");
2251     }
2252 
2253     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
2254     if (CreateGather) {
2255       NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2256                                          "wide.masked.gather");
2257     } else if (Mask) {
2258       NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2259                                        PoisonValue::get(DataTy),
2260                                        "wide.masked.load");
2261     } else {
2262       NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2263     }
2264     // Add metadata to the load, but setVectorValue to the reverse shuffle.
2265     State.addMetadata(NewLI, LI);
2266     if (Reverse)
2267       NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2268     State.set(this, NewLI, Part);
2269   }
2270 }
2271 
2272 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2273 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2274                               VPSlotTracker &SlotTracker) const {
2275   O << Indent << "WIDEN ";
2276   printAsOperand(O, SlotTracker);
2277   O << " = load ";
2278   printOperands(O, SlotTracker);
2279 }
2280 #endif
2281 
2282 /// Use an all-true mask for the reverse rather than the actual mask, as it
2283 /// avoids a dependence without affecting the result.
2284 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2285                                      Value *EVL, const Twine &Name) {
2286   VectorType *ValTy = cast<VectorType>(Operand->getType());
2287   Value *AllTrueMask =
2288       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2289   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2290                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2291 }
2292 
2293 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2294   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
2295                           "explicit vector length.");
2296   auto *LI = cast<LoadInst>(&Ingredient);
2297 
2298   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2299   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2300   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2301   bool CreateGather = !isConsecutive();
2302 
2303   auto &Builder = State.Builder;
2304   State.setDebugLocFrom(getDebugLoc());
2305   CallInst *NewLI;
2306   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2307   Value *Addr = State.get(getAddr(), 0, !CreateGather);
2308   Value *Mask = nullptr;
2309   if (VPValue *VPMask = getMask()) {
2310     Mask = State.get(VPMask, 0);
2311     if (isReverse())
2312       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2313   } else {
2314     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2315   }
2316 
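  // Both paths below emit a VP intrinsic; e.g. (illustrative element type) the
  // consecutive case produces something like:
  //   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
  //                     ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)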
2317   if (CreateGather) {
2318     NewLI =
2319         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2320                                 nullptr, "wide.masked.gather");
2321   } else {
2322     VectorBuilder VBuilder(Builder);
2323     VBuilder.setEVL(EVL).setMask(Mask);
2324     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2325         Instruction::Load, DataTy, Addr, "vp.op.load"));
2326   }
2327   NewLI->addParamAttr(
2328       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2329   State.addMetadata(NewLI, LI);
2330   Instruction *Res = NewLI;
2331   if (isReverse())
2332     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2333   State.set(this, Res, 0);
2334 }
2335 
2336 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2337 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2338                                  VPSlotTracker &SlotTracker) const {
2339   O << Indent << "WIDEN ";
2340   printAsOperand(O, SlotTracker);
2341   O << " = vp.load ";
2342   printOperands(O, SlotTracker);
2343 }
2344 #endif
2345 
2346 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2347   auto *SI = cast<StoreInst>(&Ingredient);
2348 
2349   VPValue *StoredVPValue = getStoredValue();
2350   bool CreateScatter = !isConsecutive();
2351   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2352 
2353   auto &Builder = State.Builder;
2354   State.setDebugLocFrom(getDebugLoc());
2355 
2356   for (unsigned Part = 0; Part < State.UF; ++Part) {
2357     Instruction *NewSI = nullptr;
2358     Value *Mask = nullptr;
2359     if (auto *VPMask = getMask()) {
2360       // Mask reversal is only needed when a mask is present; a null mask stands
2361       // for an all-one mask, and the reverse of an all-one mask is still all-one.
2362       Mask = State.get(VPMask, Part);
2363       if (isReverse())
2364         Mask = Builder.CreateVectorReverse(Mask, "reverse");
2365     }
2366 
2367     Value *StoredVal = State.get(StoredVPValue, Part);
2368     if (isReverse()) {
2369       // If we store to reverse consecutive memory locations, then we need
2370       // to reverse the order of elements in the stored value.
2371       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2372       // We don't want to update the value in the map as it might be used in
2373       // another expression. So don't call resetVectorValue(StoredVal).
2374     }
2375     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
2376     if (CreateScatter)
2377       NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2378     else if (Mask)
2379       NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2380     else
2381       NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2382     State.addMetadata(NewSI, SI);
2383   }
2384 }
2385 
2386 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2387 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2388                                VPSlotTracker &SlotTracker) const {
2389   O << Indent << "WIDEN store ";
2390   printOperands(O, SlotTracker);
2391 }
2392 #endif
2393 
2394 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2395   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
2396                           "explicit vector length.");
2397   auto *SI = cast<StoreInst>(&Ingredient);
2398 
2399   VPValue *StoredValue = getStoredValue();
2400   bool CreateScatter = !isConsecutive();
2401   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2402 
2403   auto &Builder = State.Builder;
2404   State.setDebugLocFrom(getDebugLoc());
2405 
2406   CallInst *NewSI = nullptr;
2407   Value *StoredVal = State.get(StoredValue, 0);
2408   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2409   if (isReverse())
2410     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2411   Value *Mask = nullptr;
2412   if (VPValue *VPMask = getMask()) {
2413     Mask = State.get(VPMask, 0);
2414     if (isReverse())
2415       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2416   } else {
2417     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2418   }
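  // As with the EVL load above, a VP intrinsic is emitted; e.g. (illustrative
  // element type) the consecutive case produces something like:
  //   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr %addr,
  //                                       <vscale x 4 x i1> %mask, i32 %evl)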
2419   Value *Addr = State.get(getAddr(), 0, !CreateScatter);
2420   if (CreateScatter) {
2421     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2422                                     Intrinsic::vp_scatter,
2423                                     {StoredVal, Addr, Mask, EVL});
2424   } else {
2425     VectorBuilder VBuilder(Builder);
2426     VBuilder.setEVL(EVL).setMask(Mask);
2427     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2428         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2429         {StoredVal, Addr}));
2430   }
2431   NewSI->addParamAttr(
2432       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2433   State.addMetadata(NewSI, SI);
2434 }
2435 
2436 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2437 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2438                                   VPSlotTracker &SlotTracker) const {
2439   O << Indent << "WIDEN vp.store ";
2440   printOperands(O, SlotTracker);
2441 }
2442 #endif
2443 
2444 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2445                                      VectorType *DstVTy, const DataLayout &DL) {
2446   // Verify that V is a vector type with same number of elements as DstVTy.
2447   auto VF = DstVTy->getElementCount();
2448   auto *SrcVecTy = cast<VectorType>(V->getType());
2449   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2450   Type *SrcElemTy = SrcVecTy->getElementType();
2451   Type *DstElemTy = DstVTy->getElementType();
2452   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2453          "Vector elements must have same size");
2454 
2455   // Do a direct cast if element types are castable.
2456   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2457     return Builder.CreateBitOrPointerCast(V, DstVTy);
2458   }
2459   // V cannot be directly casted to desired vector type.
2460   // May happen when V is a floating point vector but DstVTy is a vector of
2461   // pointers or vice-versa. Handle this using a two-step bitcast using an
2462   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2463   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2464          "Only one type should be a pointer type");
2465   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2466          "Only one type should be a floating point type");
2467   Type *IntTy =
2468       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2469   auto *VecIntTy = VectorType::get(IntTy, VF);
2470   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2471   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2472 }
2473 
2474 /// Return a vector containing interleaved elements from multiple
2475 /// smaller input vectors.
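/// For example, interleaving <A0, A1> and <B0, B1> (factor 2) yields
/// <A0, B0, A1, B1>.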
2476 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2477                                 const Twine &Name) {
2478   unsigned Factor = Vals.size();
2479   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2480 
2481   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2482 #ifndef NDEBUG
2483   for (Value *Val : Vals)
2484     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2485 #endif
2486 
2487   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2488   // must use intrinsics to interleave.
2489   if (VecTy->isScalableTy()) {
2490     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2491     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2492                                    Vals,
2493                                    /*FMFSource=*/nullptr, Name);
2494   }
2495 
2496   // Fixed length. Start by concatenating all vectors into a wide vector.
2497   Value *WideVec = concatenateVectors(Builder, Vals);
2498 
2499   // Interleave the elements into the wide vector.
2500   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2501   return Builder.CreateShuffleVector(
2502       WideVec, createInterleaveMask(NumElts, Factor), Name);
2503 }
2504 
2505 // Try to vectorize the interleave group that \p Instr belongs to.
2506 //
2507 // E.g. Translate following interleaved load group (factor = 3):
2508 //   for (i = 0; i < N; i+=3) {
2509 //     R = Pic[i];             // Member of index 0
2510 //     G = Pic[i+1];           // Member of index 1
2511 //     B = Pic[i+2];           // Member of index 2
2512 //     ... // do something to R, G, B
2513 //   }
2514 // To:
2515 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2516 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2517 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2518 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2519 //
2520 // Or translate following interleaved store group (factor = 3):
2521 //   for (i = 0; i < N; i+=3) {
2522 //     ... do something to R, G, B
2523 //     Pic[i]   = R;           // Member of index 0
2524 //     Pic[i+1] = G;           // Member of index 1
2525 //     Pic[i+2] = B;           // Member of index 2
2526 //   }
2527 // To:
2528 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2529 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2530 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2531 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2532 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2533 void VPInterleaveRecipe::execute(VPTransformState &State) {
2534   assert(!State.Instance && "Interleave group being replicated.");
2535   const InterleaveGroup<Instruction> *Group = IG;
2536   Instruction *Instr = Group->getInsertPos();
2537 
2538   // Prepare for the vector type of the interleaved load/store.
2539   Type *ScalarTy = getLoadStoreType(Instr);
2540   unsigned InterleaveFactor = Group->getFactor();
2541   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2542 
2543   // Prepare for the new pointers.
2544   SmallVector<Value *, 2> AddrParts;
2545   unsigned Index = Group->getIndex(Instr);
2546 
2547   // TODO: extend the masked interleaved-group support to reversed access.
2548   VPValue *BlockInMask = getMask();
2549   assert((!BlockInMask || !Group->isReverse()) &&
2550          "Reversed masked interleave-group not supported.");
2551 
2552   Value *Idx;
2553   // If the group is reverse, adjust the index to refer to the last vector lane
2554   // instead of the first. We adjust the index from the first vector lane,
2555   // rather than directly getting the pointer for lane VF - 1, because the
2556   // pointer operand of the interleaved access is supposed to be uniform. For
2557   // uniform instructions, we're only required to generate a value for the
2558   // first vector lane in each unroll iteration.
2559   if (Group->isReverse()) {
2560     Value *RuntimeVF =
2561         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2562     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2563     Idx = State.Builder.CreateMul(Idx,
2564                                   State.Builder.getInt32(Group->getFactor()));
2565     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2566     Idx = State.Builder.CreateNeg(Idx);
2567   } else
2568     Idx = State.Builder.getInt32(-Index);
2569 
2570   VPValue *Addr = getAddr();
2571   for (unsigned Part = 0; Part < State.UF; Part++) {
2572     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2573     if (auto *I = dyn_cast<Instruction>(AddrPart))
2574       State.setDebugLocFrom(I->getDebugLoc());
2575 
2576     // Note that the current instruction could be at any member index. The
2577     // address needs to be adjusted to that of the member at index 0.
2578     //
2579     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2580     //       b = A[i];       // Member of index 0
2581     // The current pointer points to A[i+1]; adjust it to A[i].
2582     //
2583     // E.g.  A[i+1] = a;     // Member of index 1
2584     //       A[i]   = b;     // Member of index 0
2585     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2586     // The current pointer points to A[i+2]; adjust it to A[i].
2587 
2588     bool InBounds = false;
2589     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2590       InBounds = gep->isInBounds();
2591     AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2592     AddrParts.push_back(AddrPart);
2593   }
2594 
2595   State.setDebugLocFrom(Instr->getDebugLoc());
2596   Value *PoisonVec = PoisonValue::get(VecTy);
2597 
2598   auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
2599                              unsigned Part, Value *MaskForGaps) -> Value * {
2600     if (State.VF.isScalable()) {
2601       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2602       assert(InterleaveFactor == 2 &&
2603              "Unsupported deinterleave factor for scalable vectors");
2604       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2605       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2606       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2607                                      State.VF.getKnownMinValue() * 2, true);
2608       return State.Builder.CreateIntrinsic(
2609           MaskTy, Intrinsic::vector_interleave2, Ops,
2610           /*FMFSource=*/nullptr, "interleaved.mask");
2611     }
2612 
2613     if (!BlockInMask)
2614       return MaskForGaps;
2615 
2616     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2617     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2618         BlockInMaskPart,
2619         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2620         "interleaved.mask");
2621     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2622                                                    ShuffledMask, MaskForGaps)
2623                        : ShuffledMask;
2624   };
2625 
2626   const DataLayout &DL = Instr->getDataLayout();
2627   // Vectorize the interleaved load group.
2628   if (isa<LoadInst>(Instr)) {
2629     Value *MaskForGaps = nullptr;
2630     if (NeedsMaskForGaps) {
2631       MaskForGaps = createBitMaskForGaps(State.Builder,
2632                                          State.VF.getKnownMinValue(), *Group);
2633       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2634     }
2635 
2636     // For each unroll part, create a wide load for the group.
2637     SmallVector<Value *, 2> NewLoads;
2638     for (unsigned Part = 0; Part < State.UF; Part++) {
2639       Instruction *NewLoad;
2640       if (BlockInMask || MaskForGaps) {
2641         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2642         NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
2643                                                  Group->getAlign(), GroupMask,
2644                                                  PoisonVec, "wide.masked.vec");
2645       } else
2646         NewLoad = State.Builder.CreateAlignedLoad(
2647             VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
2648       Group->addMetadata(NewLoad);
2649       NewLoads.push_back(NewLoad);
2650     }
2651 
2652     ArrayRef<VPValue *> VPDefs = definedValues();
2653     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2654     if (VecTy->isScalableTy()) {
2655       assert(InterleaveFactor == 2 &&
2656              "Unsupported deinterleave factor for scalable vectors");
2657 
2658       for (unsigned Part = 0; Part < State.UF; ++Part) {
2659         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2660         // so we must use intrinsics to deinterleave.
2661         Value *DI = State.Builder.CreateIntrinsic(
2662             Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2663             /*FMFSource=*/nullptr, "strided.vec");
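             // vector_deinterleave2 returns a pair of vectors: field 0 holds
             // the even lanes (member 0) and field 1 the odd lanes (member 1).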
2664         unsigned J = 0;
2665         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2666           Instruction *Member = Group->getMember(I);
2667 
2668           if (!Member)
2669             continue;
2670 
2671           Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2672           // If this member has a different type, cast the result to it.
2673           if (Member->getType() != ScalarTy) {
2674             VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2675             StridedVec =
2676                 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2677           }
2678 
2679           if (Group->isReverse())
2680             StridedVec =
2681                 State.Builder.CreateVectorReverse(StridedVec, "reverse");
2682 
2683           State.set(VPDefs[J], StridedVec, Part);
2684           ++J;
2685         }
2686       }
2687 
2688       return;
2689     }
2690 
2691     // For each member in the group, shuffle out the appropriate data from the
2692     // wide loads.
2693     unsigned J = 0;
2694     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2695       Instruction *Member = Group->getMember(I);
2696 
2697       // Skip the gaps in the group.
2698       if (!Member)
2699         continue;
2700 
2701       auto StrideMask =
2702           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
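           // E.g. member 1 of a factor-2 group with VF == 4 is extracted from
           // the wide load with the shuffle mask <1, 3, 5, 7>.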
2703       for (unsigned Part = 0; Part < State.UF; Part++) {
2704         Value *StridedVec = State.Builder.CreateShuffleVector(
2705             NewLoads[Part], StrideMask, "strided.vec");
2706 
2707         // If this member has a different type, cast the result to it.
2708         if (Member->getType() != ScalarTy) {
2709           assert(!State.VF.isScalable() && "VF is assumed to be non-scalable.");
2710           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2711           StridedVec =
2712               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2713         }
2714 
2715         if (Group->isReverse())
2716           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2717 
2718         State.set(VPDefs[J], StridedVec, Part);
2719       }
2720       ++J;
2721     }
2722     return;
2723   }
2724 
2725   // The sub-vector type for the current instruction.
2726   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2727 
2728   // Vectorize the interleaved store group.
2729   Value *MaskForGaps =
2730       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2731   assert((!MaskForGaps || !State.VF.isScalable()) &&
2732          "masking gaps for scalable vectors is not yet supported.");
2733   ArrayRef<VPValue *> StoredValues = getStoredValues();
2734   for (unsigned Part = 0; Part < State.UF; Part++) {
2735     // Collect the stored vector from each member.
2736     SmallVector<Value *, 4> StoredVecs;
2737     unsigned StoredIdx = 0;
2738     for (unsigned i = 0; i < InterleaveFactor; i++) {
2739       assert((Group->getMember(i) || MaskForGaps) &&
2740              "Fail to get a member from an interleaved store group");
2741       Instruction *Member = Group->getMember(i);
2742 
2743       // Skip the gaps in the group.
2744       if (!Member) {
2745         Value *Poison = PoisonValue::get(SubVT);
2746         StoredVecs.push_back(Poison);
2747         continue;
2748       }
2749 
2750       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2751       ++StoredIdx;
2752 
2753       if (Group->isReverse())
2754         StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2755 
2756       // If this member has a different type, cast it to the unified type.
2757 
2758       if (StoredVec->getType() != SubVT)
2759         StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2760 
2761       StoredVecs.push_back(StoredVec);
2762     }
2763 
2764     // Interleave all the smaller vectors into one wider vector.
2765     Value *IVec =
2766         interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
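         // E.g. stored members <a0,a1,a2,a3> and <b0,b1,b2,b3> are combined
         // into the wide vector <a0,b0,a1,b1,a2,b2,a3,b3>.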
2767     Instruction *NewStoreInstr;
2768     if (BlockInMask || MaskForGaps) {
2769       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2770       NewStoreInstr = State.Builder.CreateMaskedStore(
2771           IVec, AddrParts[Part], Group->getAlign(), GroupMask);
2772     } else
2773       NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
2774                                                        Group->getAlign());
2775 
2776     Group->addMetadata(NewStoreInstr);
2777   }
2778 }
2779 
2780 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2781 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2782                                VPSlotTracker &SlotTracker) const {
2783   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2784   IG->getInsertPos()->printAsOperand(O, false);
2785   O << ", ";
2786   getAddr()->printAsOperand(O, SlotTracker);
2787   VPValue *Mask = getMask();
2788   if (Mask) {
2789     O << ", ";
2790     Mask->printAsOperand(O, SlotTracker);
2791   }
2792 
2793   unsigned OpIdx = 0;
2794   for (unsigned i = 0; i < IG->getFactor(); ++i) {
2795     if (!IG->getMember(i))
2796       continue;
2797     if (getNumStoreOperands() > 0) {
2798       O << "\n" << Indent << "  store ";
2799       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2800       O << " to index " << i;
2801     } else {
2802       O << "\n" << Indent << "  ";
2803       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2804       O << " = load from index " << i;
2805     }
2806     ++OpIdx;
2807   }
2808 }
2809 #endif
2810 
2811 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2812   Value *Start = getStartValue()->getLiveInIRValue();
2813   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
2814   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2815 
2816   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2817   EntryPart->addIncoming(Start, VectorPH);
2818   EntryPart->setDebugLoc(getDebugLoc());
2819   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2820     State.set(this, EntryPart, Part, /*IsScalar*/ true);
2821 }
2822 
2823 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2824 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2825                                    VPSlotTracker &SlotTracker) const {
2826   O << Indent << "EMIT ";
2827   printAsOperand(O, SlotTracker);
2828   O << " = CANONICAL-INDUCTION ";
2829   printOperands(O, SlotTracker);
2830 }
2831 #endif
2832 
2833 bool VPCanonicalIVPHIRecipe::isCanonical(
2834     InductionDescriptor::InductionKind Kind, VPValue *Start,
2835     VPValue *Step) const {
2836   // Must be an integer induction.
2837   if (Kind != InductionDescriptor::IK_IntInduction)
2838     return false;
2839   // Start must match the start value of this canonical induction.
2840   if (Start != getStartValue())
2841     return false;
2842 
2843   // If the step is defined by a recipe, it is not a ConstantInt.
2844   if (Step->getDefiningRecipe())
2845     return false;
2846 
2847   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2848   return StepC && StepC->isOne();
2849 }
2850 
2851 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2852   return IsScalarAfterVectorization &&
2853          (!IsScalable || vputils::onlyFirstLaneUsed(this));
2854 }
2855 
2856 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
2857   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
2858          "Not a pointer induction according to InductionDescriptor!");
2859   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
2860          "Unexpected type.");
2861   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
2862          "Recipe should have been replaced");
2863 
2864   auto *IVR = getParent()->getPlan()->getCanonicalIV();
2865   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
2866   Type *PhiType = IndDesc.getStep()->getType();
2867 
2868   // Build a pointer phi
2869   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
2870   Type *ScStValueType = ScalarStartValue->getType();
2871   PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
2872                                            CanonicalIV->getIterator());
2873 
2874   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2875   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
2876 
2877   // A pointer induction, performed by using a GEP.
2878   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
2879 
2880   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
2881   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
2882   Value *NumUnrolledElems =
2883       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
2884   Value *InductionGEP = GetElementPtrInst::Create(
2885       State.Builder.getInt8Ty(), NewPointerPhi,
2886       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
2887       InductionLoc);
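       // Since the GEP element type is i8, the pointer advances by
       // Step * VF * UF bytes per iteration of the vector loop.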
2888   // Add induction update using an incorrect block temporarily. The phi node
2889   // will be fixed after VPlan execution. Note that at this point the latch
2890   // block cannot be used, as it does not exist yet.
2891   // TODO: Model increment value in VPlan, by turning the recipe into a
2892   // multi-def and a subclass of VPHeaderPHIRecipe.
2893   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
2894 
2895   // Create UF actual address GEPs that use the pointer phi as the base and
2896   // a vectorized version of the step value (<step*0, ..., step*N>) as the
2897   // offset.
2898   for (unsigned Part = 0; Part < State.UF; ++Part) {
2899     Type *VecPhiType = VectorType::get(PhiType, State.VF);
2900     Value *StartOffsetScalar =
2901         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
2902     Value *StartOffset =
2903         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
2904     // Add a step vector of consecutive numbers from 0 to VF-1.
2905     StartOffset = State.Builder.CreateAdd(
2906         StartOffset, State.Builder.CreateStepVector(VecPhiType));
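         // E.g. for Part == 1 and VF == 4 the per-lane offsets are
         // <4, 5, 6, 7>; they are scaled by the scalar step below.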
2907 
2908     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
2909            "scalar step must be the same across all parts");
2910     Value *GEP = State.Builder.CreateGEP(
2911         State.Builder.getInt8Ty(), NewPointerPhi,
2912         State.Builder.CreateMul(
2913             StartOffset,
2914             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
2915             "vector.gep"));
2916     State.set(this, GEP, Part);
2917   }
2918 }
2919 
2920 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2921 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2922                                           VPSlotTracker &SlotTracker) const {
2923   O << Indent << "EMIT ";
2924   printAsOperand(O, SlotTracker);
2925   O << " = WIDEN-POINTER-INDUCTION ";
2926   getStartValue()->printAsOperand(O, SlotTracker);
2927   O << ", " << *IndDesc.getStep();
2928 }
2929 #endif
2930 
2931 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2932   assert(!State.Instance && "cannot be used in per-lane");
2933   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2934   SCEVExpander Exp(SE, DL, "induction");
2935 
2936   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
2937                                  &*State.Builder.GetInsertPoint());
2938   assert(!State.ExpandedSCEVs.contains(Expr) &&
2939          "Same SCEV expanded multiple times");
2940   State.ExpandedSCEVs[Expr] = Res;
2941   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2942     State.set(this, Res, {Part, 0});
2943 }
2944 
2945 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2946 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2947                                VPSlotTracker &SlotTracker) const {
2948   O << Indent << "EMIT ";
2949   getVPSingleValue()->printAsOperand(O, SlotTracker);
2950   O << " = EXPAND SCEV " << *Expr;
2951 }
2952 #endif
2953 
2954 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2955   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
2956   Type *STy = CanonicalIV->getType();
2957   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2958   ElementCount VF = State.VF;
2959   Value *VStart = VF.isScalar()
2960                       ? CanonicalIV
2961                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
2962   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2963     Value *VStep = createStepForVF(Builder, STy, VF, Part);
2964     if (VF.isVector()) {
2965       VStep = Builder.CreateVectorSplat(VF, VStep);
2966       VStep =
2967           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
2968     }
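         // E.g. for VF == 4 and UF == 2 this produces <iv, iv+1, iv+2, iv+3>
         // for part 0 and <iv+4, iv+5, iv+6, iv+7> for part 1.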
2969     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
2970     State.set(this, CanonicalVectorIV, Part);
2971   }
2972 }
2973 
2974 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2975 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
2976                                      VPSlotTracker &SlotTracker) const {
2977   O << Indent << "EMIT ";
2978   printAsOperand(O, SlotTracker);
2979   O << " = WIDEN-CANONICAL-INDUCTION ";
2980   printOperands(O, SlotTracker);
2981 }
2982 #endif
2983 
2984 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
2985   auto &Builder = State.Builder;
2986   // Create a vector from the initial value.
2987   auto *VectorInit = getStartValue()->getLiveInIRValue();
2988 
2989   Type *VecTy = State.VF.isScalar()
2990                     ? VectorInit->getType()
2991                     : VectorType::get(VectorInit->getType(), State.VF);
2992 
2993   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2994   if (State.VF.isVector()) {
2995     auto *IdxTy = Builder.getInt32Ty();
2996     auto *One = ConstantInt::get(IdxTy, 1);
2997     IRBuilder<>::InsertPointGuard Guard(Builder);
2998     Builder.SetInsertPoint(VectorPH->getTerminator());
2999     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3000     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3001     VectorInit = Builder.CreateInsertElement(
3002         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
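         // E.g. for VF == 4 this builds <poison, poison, poison, init>, placing
         // the incoming scalar value in the last lane of the recurrence vector.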
3003   }
3004 
3005   // Create a phi node for the new recurrence.
3006   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
3007   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3008   EntryPart->addIncoming(VectorInit, VectorPH);
3009   State.set(this, EntryPart, 0);
3010 }
3011 
3012 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3013 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3014                                             VPSlotTracker &SlotTracker) const {
3015   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3016   printAsOperand(O, SlotTracker);
3017   O << " = phi ";
3018   printOperands(O, SlotTracker);
3019 }
3020 #endif
3021 
3022 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3023   auto &Builder = State.Builder;
3024 
3025   // Reductions do not have to start at zero. They can start with any
3026   // loop-invariant value.
3027   VPValue *StartVPV = getStartValue();
3028   Value *StartV = StartVPV->getLiveInIRValue();
3029 
3030   // In order to support recurrences we need to be able to vectorize Phi nodes.
3031   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3032   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3033   // this value when we vectorize all of the instructions that use the PHI.
3034   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3035   Type *VecTy = ScalarPHI ? StartV->getType()
3036                           : VectorType::get(StartV->getType(), State.VF);
3037 
3038   BasicBlock *HeaderBB = State.CFG.PrevBB;
3039   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
3040          "recipe must be in the vector loop header");
3041   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
3042   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
3043     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
3044     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
3045     State.set(this, EntryPart, Part, IsInLoop);
3046   }
3047 
3048   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3049 
3050   Value *Iden = nullptr;
3051   RecurKind RK = RdxDesc.getRecurrenceKind();
3052   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3053       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3054     // MinMax and AnyOf reductions have the start value as their identity.
3055     if (ScalarPHI) {
3056       Iden = StartV;
3057     } else {
3058       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3059       Builder.SetInsertPoint(VectorPH->getTerminator());
3060       StartV = Iden =
3061           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
3062     }
3063   } else {
3064     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3065                                        RdxDesc.getFastMathFlags());
3066 
3067     if (!ScalarPHI) {
3068       Iden = Builder.CreateVectorSplat(State.VF, Iden);
3069       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3070       Builder.SetInsertPoint(VectorPH->getTerminator());
3071       Constant *Zero = Builder.getInt32(0);
3072       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
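           // E.g. for an integer add reduction with VF == 4 and start value s,
           // the vector start value becomes <s, 0, 0, 0>.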
3073     }
3074   }
3075 
3076   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
3077     Value *EntryPart = State.get(this, Part, IsInLoop);
3078     // Make sure to add the reduction start value only to the
3079     // first unroll part.
3080     Value *StartVal = (Part == 0) ? StartV : Iden;
3081     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
3082   }
3083 }
3084 
3085 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3086 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3087                                  VPSlotTracker &SlotTracker) const {
3088   O << Indent << "WIDEN-REDUCTION-PHI ";
3089 
3090   printAsOperand(O, SlotTracker);
3091   O << " = phi ";
3092   printOperands(O, SlotTracker);
3093 }
3094 #endif
3095 
3096 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3097   assert(EnableVPlanNativePath &&
3098          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3099 
3100   Value *Op0 = State.get(getOperand(0), 0);
3101   Type *VecTy = Op0->getType();
3102   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3103   State.set(this, VecPhi, 0);
3104 }
3105 
3106 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3107 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3108                              VPSlotTracker &SlotTracker) const {
3109   O << Indent << "WIDEN-PHI ";
3110 
3111   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3112   // Unless all incoming values are modeled in VPlan, print the original PHI
3113   // directly.
3114   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3115   // values as VPValues.
3116   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3117     O << VPlanIngredient(OriginalPhi);
3118     return;
3119   }
3120 
3121   printAsOperand(O, SlotTracker);
3122   O << " = phi ";
3123   printOperands(O, SlotTracker);
3124 }
3125 #endif
3126 
3127 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3128 // remove VPActiveLaneMaskPHIRecipe.
3129 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3130   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3131   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
3132     Value *StartMask = State.get(getOperand(0), Part);
3133     PHINode *EntryPart =
3134         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3135     EntryPart->addIncoming(StartMask, VectorPH);
3136     EntryPart->setDebugLoc(getDebugLoc());
3137     State.set(this, EntryPart, Part);
3138   }
3139 }
3140 
3141 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3142 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3143                                       VPSlotTracker &SlotTracker) const {
3144   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3145 
3146   printAsOperand(O, SlotTracker);
3147   O << " = phi ";
3148   printOperands(O, SlotTracker);
3149 }
3150 #endif
3151 
3152 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
3153   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3154   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
3155   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
3156   PHINode *EntryPart =
3157       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
3158   EntryPart->addIncoming(Start, VectorPH);
3159   EntryPart->setDebugLoc(getDebugLoc());
3160   State.set(this, EntryPart, 0, /*IsScalar=*/true);
3161 }
3162 
3163 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3164 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3165                                   VPSlotTracker &SlotTracker) const {
3166   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3167 
3168   printAsOperand(O, SlotTracker);
3169   O << " = phi ";
3170   printOperands(O, SlotTracker);
3171 }
3172 #endif
3173