//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPIteration(0, 0));
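  // If the phi already has an incoming value for PredBB, update it in place;
  // otherwise add a new incoming edge from PredBB.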
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only computes the cost of an instruction
  // using its underlying instruction. Removing the WidenMem case here would
  // prevent force-target-instruction-cost from overriding the cost of a recipe
  // with an underlying instruction, which would be inconsistent with the
  // legacy model.
  // TODO: Remove WidenMem from this function when we don't need to compare to
  // the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
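  // -force-target-instruction-cost only overrides the cost of recipes with an
  // underlying instruction, keeping its behavior consistent with the legacy
  // cost model.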
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost of the recipe by falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // return 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

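    // Otherwise emit the mask intrinsic; e.g. for VF = 4 and i64 operands this
    // produces (illustrative):
    //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv0,
    //                                                            i64 %tc)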
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1); for subsequent parts,
    // use v2 from the previous part.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
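    // A splice with immediate -1 concatenates the last lane of PartMinus1 with
    // the first VF-1 lanes of V2, producing the v3 layout sketched above.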
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

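    // Compute TC > VF * UF ? TC - VF * UF : 0; the select guards against
    // unsigned underflow when the trip count is smaller than the step.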
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

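      // The emitted call returns the number of lanes to process in this
      // iteration; e.g. for an i64 AVL (illustrative):
      //   %evl = call i32 @llvm.experimental.get.vector.length.i64(
      //       i64 %avl, i32 <vf>, i1 true)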
      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop so that
    // vsetvli can be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
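    // E.g. for VF = 4 and Part = 2 the step is 8, or 8 * vscale for scalable
    // VFs (illustrative).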
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to the backward destination for exiting blocks now
    // and to the forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to the backward destination (the header) now and
    // to the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as its first
    // argument, so we pass the insert block for now and reset the successor to
    // nullptr immediately afterwards.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that in-loop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
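      // createReduction emits a horizontal reduction; e.g. an integer sum
      // becomes a call to llvm.vector.reduce.add (illustrative).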
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
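      // E.g. Offset == 1 retrieves the last lane of the last unrolled part.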
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When the loop is unrolled without vectorizing, retrieve the value from
      // part UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
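    // A logical AND is emitted as a select (A ? B : false), so poison in B
    // does not propagate when A is false.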
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
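    // CreatePtrAdd lowers to a byte-wise GEP, e.g. (illustrative):
    //   %r = getelementptr i8, ptr %ptr, i64 %addend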
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
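  // For each unroll part, assemble the argument list (vector values, with
  // scalar values where the intrinsic or variant expects them) and emit a call
  // to the vector intrinsic or the vectorized function variant.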
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (Variant) {
    return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                    Variant->getFunctionType()->params(),
                                    CostKind);
  }

  FastMathFlags FMF;
  // TODO: Manage flags via VPRecipeWithIRFlags.
  if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
    FMF = FPMO->getFastMathFlags();

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation; let the legacy cost model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand; one example of this is shifts on x86.
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideVectorRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // The cost of freeze is unknown; assume it is the same as a 'mul' for now.
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      CostKind, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

  State.setDebugLocFrom(getDebugLoc());
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length.");

  assert(State.get(getOperand(0), 0)->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp, 0));
  }

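  // VectorBuilder emits the VP counterpart of the scalar opcode; e.g. an fadd
  // with <vscale x 4 x float> operands becomes (illustrative):
  //   %vp.op = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(
  //       <vscale x 4 x float> %a, <vscale x 4 x float> %b,
  //       <vscale x 4 x i1> %mask, i32 %evl)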
  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept fast-math flags.
  // TODO: Enable other flags when support is added.
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst, 0);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-VP ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for floating-point induction variables.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

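  // E.g. for VF = 4, StartIdx = 0, and an integer step s, a splatted start
  // value <v, v, v, v> becomes <v, v+s, v+2*s, v+3*s> (illustrative).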
  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF - 1.
1380   VectorType *InitVecValVTy = ValVTy;
1381   if (STy->isFloatingPointTy()) {
1382     Type *InitVecValSTy =
1383         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1384     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1385   }
1386   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1387 
1388   // Splat the StartIdx
1389   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1390 
1391   if (STy->isIntegerTy()) {
1392     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1393     Step = Builder.CreateVectorSplat(VLen, Step);
1394     assert(Step->getType() == Val->getType() && "Invalid step vec");
1395     // FIXME: The newly created binary instructions should contain nsw/nuw
1396     // flags, which can be found from the original scalar operations.
1397     Step = Builder.CreateMul(InitVec, Step);
1398     return Builder.CreateAdd(Val, Step, "induction");
1399   }
1400 
1401   // Floating point induction.
1402   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1403          "Binary Opcode should be specified for FP induction");
1404   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1405   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1406 
1407   Step = Builder.CreateVectorSplat(VLen, Step);
1408   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1409   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1410 }
1411 
1412 /// A helper function that returns an integer or floating-point constant with
1413 /// value C.
1414 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1415   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1416                            : ConstantFP::get(Ty, C);
1417 }
1418 
1419 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1420   assert(!State.Instance && "Int or FP induction being replicated.");
1421 
1422   Value *Start = getStartValue()->getLiveInIRValue();
1423   const InductionDescriptor &ID = getInductionDescriptor();
1424   TruncInst *Trunc = getTruncInst();
1425   IRBuilderBase &Builder = State.Builder;
1426   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1427   assert(State.VF.isVector() && "must have vector VF");
1428 
1429   // The value from the original loop to which we are mapping the new induction
1430   // variable.
1431   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1432 
1433   // Fast-math-flags propagate from the original induction instruction.
1434   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1435   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1436     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1437 
1438   // Now do the actual transformations, and start with fetching the step value.
1439   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1440 
1441   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1442          "Expected either an induction phi-node or a truncate of it!");
1443 
1444   // Construct the initial value of the vector IV in the vector loop preheader.
1445   auto CurrIP = Builder.saveIP();
1446   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1447   Builder.SetInsertPoint(VectorPH->getTerminator());
1448   if (isa<TruncInst>(EntryVal)) {
1449     assert(Start->getType()->isIntegerTy() &&
1450            "Truncation requires an integer type");
1451     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1452     Step = Builder.CreateTrunc(Step, TruncType);
1453     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1454   }
1455 
1456   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1457   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1458   Value *SteppedStart = getStepVector(
1459       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
1460 
1461   // We create vector phi nodes for both integer and floating-point induction
1462   // variables. Here, we determine the kind of arithmetic we will perform.
1463   Instruction::BinaryOps AddOp;
1464   Instruction::BinaryOps MulOp;
1465   if (Step->getType()->isIntegerTy()) {
1466     AddOp = Instruction::Add;
1467     MulOp = Instruction::Mul;
1468   } else {
1469     AddOp = ID.getInductionOpcode();
1470     MulOp = Instruction::FMul;
1471   }
1472 
1473   // Multiply the vectorization factor by the step using integer or
1474   // floating-point arithmetic as appropriate.
1475   Type *StepType = Step->getType();
1476   Value *RuntimeVF = State.get(getVFValue(), {0, 0});
1477   if (Step->getType()->isFloatingPointTy())
1478     RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1479   else
1480     RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1481   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1482 
1483   // Create a vector splat to use in the induction update.
1484   //
1485   // FIXME: If the step is non-constant, we create the vector splat with
1486   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1487   //        handle a constant vector splat.
1488   Value *SplatVF = isa<Constant>(Mul)
1489                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1490                        : Builder.CreateVectorSplat(State.VF, Mul);
1491   Builder.restoreIP(CurrIP);
1492 
1493   // We may need to add the step a number of times, depending on the unroll
1494   // factor. The last of those goes into the PHI.
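  // E.g. (illustrative) with UF = 2: part 0 uses %vec.ind, part 1 uses
  // %step.add = %vec.ind + splat(VF * Step), and the value added back into
  // the phi is %vec.ind.next = %vec.ind + 2 * splat(VF * Step).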
1495   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1496   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1497   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1498   Instruction *LastInduction = VecInd;
1499   for (unsigned Part = 0; Part < State.UF; ++Part) {
1500     State.set(this, LastInduction, Part);
1501 
1502     if (isa<TruncInst>(EntryVal))
1503       State.addMetadata(LastInduction, EntryVal);
1504 
1505     LastInduction = cast<Instruction>(
1506         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
1507     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1508   }
1509 
1510   LastInduction->setName("vec.ind.next");
1511   VecInd->addIncoming(SteppedStart, VectorPH);
1512   // Add induction update using an incorrect block temporarily. The phi node
1513   // will be fixed after VPlan execution. Note that at this point the latch
1514   // block cannot be used, as it does not exist yet.
1515   // TODO: Model increment value in VPlan, by turning the recipe into a
1516   // multi-def and a subclass of VPHeaderPHIRecipe.
1517   VecInd->addIncoming(LastInduction, VectorPH);
1518 }
1519 
1520 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1521 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1522                                           VPSlotTracker &SlotTracker) const {
1523   O << Indent << "WIDEN-INDUCTION";
1524   if (getTruncInst()) {
1525     O << "\\l\"";
1526     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1527     O << " +\n" << Indent << "\"  ";
1528     getVPValue(0)->printAsOperand(O, SlotTracker);
1529   } else
1530     O << " " << VPlanIngredient(IV);
1531 
1532   O << ", ";
1533   getStepValue()->printAsOperand(O, SlotTracker);
1534 
1535   O << ", ";
1536   getVFValue()->printAsOperand(O, SlotTracker);
1537 }
1538 #endif
1539 
1540 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1541   // The step may be defined by a recipe in the preheader (e.g. if it requires
1542   // SCEV expansion), but for the canonical induction the step is required to be
1543   // 1, which is represented as live-in.
1544   if (getStepValue()->getDefiningRecipe())
1545     return false;
1546   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1547   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1548   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1549   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1550          getScalarType() == CanIV->getScalarType();
1551 }
1552 
1553 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1554 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1555                               VPSlotTracker &SlotTracker) const {
1556   O << Indent;
1557   printAsOperand(O, SlotTracker);
1558   O << " = DERIVED-IV ";
1559   getStartValue()->printAsOperand(O, SlotTracker);
1560   O << " + ";
1561   getOperand(1)->printAsOperand(O, SlotTracker);
1562   O << " * ";
1563   getStepValue()->printAsOperand(O, SlotTracker);
1564 }
1565 #endif
1566 
1567 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1568   // Fast-math-flags propagate from the original induction instruction.
1569   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1570   if (hasFastMathFlags())
1571     State.Builder.setFastMathFlags(getFastMathFlags());
1572 
1573   // Compute scalar induction steps. BaseIV is the scalar induction
1574   // variable on which to base the steps; Step is the size of the step.
1575 
1576   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1577   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1578   IRBuilderBase &Builder = State.Builder;
1579 
1580   // Ensure the step has the same type as the scalar IV.
1581   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1582   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1583 
1584   // We build scalar steps for both integer and floating-point induction
1585   // variables. Here, we determine the kind of arithmetic we will perform.
1586   Instruction::BinaryOps AddOp;
1587   Instruction::BinaryOps MulOp;
1588   if (BaseIVTy->isIntegerTy()) {
1589     AddOp = Instruction::Add;
1590     MulOp = Instruction::Mul;
1591   } else {
1592     AddOp = InductionOpcode;
1593     MulOp = Instruction::FMul;
1594   }
1595 
1596   // Determine the number of scalars we need to generate for each unroll
1597   // iteration.
1598   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1599   // Compute the scalar steps and save the results in State.
1600   Type *IntStepTy =
1601       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1602   Type *VecIVTy = nullptr;
1603   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1604   if (!FirstLaneOnly && State.VF.isScalable()) {
1605     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1606     UnitStepVec =
1607         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1608     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1609     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1610   }
1611 
1612   unsigned StartPart = 0;
1613   unsigned EndPart = State.UF;
1614   unsigned StartLane = 0;
1615   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1616   if (State.Instance) {
1617     StartPart = State.Instance->Part;
1618     EndPart = StartPart + 1;
1619     StartLane = State.Instance->Lane.getKnownLane();
1620     EndLane = StartLane + 1;
1621   }
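  // E.g. (illustrative) with fixed-width VF = 4, UF = 2, an integer IV, and
  // no predication: for Part = 1, Lane = 2, StartIdx0 is 4, StartIdx is 6,
  // and the emitted scalar step is BaseIV + 6 * Step.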
1622   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1623     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1624 
1625     if (!FirstLaneOnly && State.VF.isScalable()) {
1626       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1627       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1628       if (BaseIVTy->isFloatingPointTy())
1629         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1630       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1631       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1632       State.set(this, Add, Part);
1633       // It is also useful to record the per-lane values for the known minimum
1634       // number of elements, so we do that below as well. This improves code
1635       // quality when, for example, extracting the first element.
1636     }
1637 
1638     if (BaseIVTy->isFloatingPointTy())
1639       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1640 
1641     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1642       Value *StartIdx = Builder.CreateBinOp(
1643           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1644       // The step returned by `createStepForVF` is a runtime-evaluated value
1645       // when VF is scalable. Otherwise, it should be folded into a Constant.
1646       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1647              "Expected StartIdx to be folded to a constant when VF is not "
1648              "scalable");
1649       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1650       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1651       State.set(this, Add, VPIteration(Part, Lane));
1652     }
1653   }
1654 }
1655 
1656 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1657 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1658                                   VPSlotTracker &SlotTracker) const {
1659   O << Indent;
1660   printAsOperand(O, SlotTracker);
1661   O << " = SCALAR-STEPS ";
1662   printOperands(O, SlotTracker);
1663 }
1664 #endif
1665 
1666 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1667   assert(State.VF.isVector() && "not widening");
1668   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1669   // Construct a vector GEP by widening the operands of the scalar GEP as
1670   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1671   // results in a vector of pointers when at least one operand of the GEP
1672   // is vector-typed. Thus, to keep the representation compact, we only use
1673   // vector-typed operands for loop-varying values.
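  // E.g. (illustrative) a scalar %gep = getelementptr i32, ptr %A, i64 %iv
  // with a loop-varying index is widened for VF = 4 to
  // getelementptr i32, ptr %A, <4 x i64> %vec.iv, yielding a <4 x ptr>.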
1674 
1675   if (areAllOperandsInvariant()) {
1676     // If we are vectorizing, but the GEP has only loop-invariant operands,
1677     // the GEP we build (by only using vector-typed operands for
1678     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1679     // produce a vector of pointers, we need to either arbitrarily pick an
1680     // operand to broadcast, or broadcast a clone of the original GEP.
1681     // Here, we broadcast a clone of the original.
1682     //
1683     // TODO: If at some point we decide to scalarize instructions having
1684     //       loop-invariant operands, this special case will no longer be
1685     //       required. We would add the scalarization decision to
1686     //       collectLoopScalars() and teach getVectorValue() to broadcast
1687     //       the lane-zero scalar value.
1688     SmallVector<Value *> Ops;
1689     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1690       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1691 
1692     auto *NewGEP =
1693         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1694                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1695     for (unsigned Part = 0; Part < State.UF; ++Part) {
1696       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1697       State.set(this, EntryPart, Part);
1698       State.addMetadata(EntryPart, GEP);
1699     }
1700   } else {
1701     // If the GEP has at least one loop-varying operand, we are sure to
1702     // produce a vector of pointers. But if we are only unrolling, we want
1703     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1704     // produce with the code below will be scalar (if VF == 1) or vector
1705     // (otherwise). Note that for the unroll-only case, we still maintain
1706     // values in the vector mapping with initVector, as we do for other
1707     // instructions.
1708     for (unsigned Part = 0; Part < State.UF; ++Part) {
1709       // The pointer operand of the new GEP. If it's loop-invariant, we
1710       // won't broadcast it.
1711       auto *Ptr = isPointerLoopInvariant()
1712                       ? State.get(getOperand(0), VPIteration(0, 0))
1713                       : State.get(getOperand(0), Part);
1714 
1715       // Collect all the indices for the new GEP. If any index is
1716       // loop-invariant, we won't broadcast it.
1717       SmallVector<Value *, 4> Indices;
1718       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1719         VPValue *Operand = getOperand(I);
1720         if (isIndexLoopInvariant(I - 1))
1721           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1722         else
1723           Indices.push_back(State.get(Operand, Part));
1724       }
1725 
1726       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1727       // but it should be a vector otherwise.
1728       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1729                                              Indices, "", isInBounds());
1730       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1731              "NewGEP is not a pointer vector");
1732       State.set(this, NewGEP, Part);
1733       State.addMetadata(NewGEP, GEP);
1734     }
1735   }
1736 }
1737 
1738 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1739 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1740                              VPSlotTracker &SlotTracker) const {
1741   O << Indent << "WIDEN-GEP ";
1742   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1743   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1744     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1745 
1746   O << " ";
1747   printAsOperand(O, SlotTracker);
1748   O << " = getelementptr";
1749   printFlags(O);
1750   printOperands(O, SlotTracker);
1751 }
1752 #endif
1753 
1754 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1755   auto &Builder = State.Builder;
1756   State.setDebugLocFrom(getDebugLoc());
1757   for (unsigned Part = 0; Part < State.UF; ++Part) {
1758     // Calculate the pointer for the specific unroll-part.
1759     Value *PartPtr = nullptr;
1760     // Use i32 for the GEP index type when the value is constant,
1761     // or query DataLayout for a more suitable index type otherwise.
1762     const DataLayout &DL =
1763         Builder.GetInsertBlock()->getDataLayout();
1764     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1765                         ? DL.getIndexType(IndexedTy->getPointerTo())
1766                         : Builder.getInt32Ty();
1767     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1768     bool InBounds = isInBounds();
1769     if (IsReverse) {
1770       // If the address is consecutive but reversed, then the
1771       // wide store needs to start at the last vector element.
1772       // RunTimeVF = VScale * VF.getKnownMinValue()
1773       // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
1774       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1775       // NumElt = -Part * RunTimeVF
1776       Value *NumElt = Builder.CreateMul(
1777           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1778       // LastLane = 1 - RunTimeVF
1779       Value *LastLane =
1780           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1781       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1782       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
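      // E.g. (illustrative) with fixed-width VF = 4 and Part = 1: RunTimeVF
      // is 4, NumElt is -4, and LastLane is -3, so PartPtr points at Ptr - 7,
      // the lowest-addressed element covered by the reversed second part.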
1783     } else {
1784       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1785       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1786     }
1787 
1788     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1789   }
1790 }
1791 
1792 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1793 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1794                                   VPSlotTracker &SlotTracker) const {
1795   O << Indent;
1796   printAsOperand(O, SlotTracker);
1797   O << " = vector-pointer ";
1798   if (IsReverse)
1799     O << "(reverse) ";
1800 
1801   printOperands(O, SlotTracker);
1802 }
1803 #endif
1804 
1805 void VPBlendRecipe::execute(VPTransformState &State) {
1806   assert(isNormalized() && "Expected blend to be normalized!");
1807   State.setDebugLocFrom(getDebugLoc());
1808   // We know that all PHIs in non-header blocks are converted into
1809   // selects, so we don't have to worry about the insertion order and we
1810   // can just use the builder.
1811   // At this point we generate the predication tree. There may be
1812   // duplications since this is a simple recursive scan, but future
1813   // optimizations will clean it up.
1814 
1815   unsigned NumIncoming = getNumIncomingValues();
1816 
1817   // Generate a sequence of selects of the form:
1818   // SELECT(Mask3, In3,
1819   //        SELECT(Mask2, In2,
1820   //               SELECT(Mask1, In1,
1821   //                      In0)))
1822   // Note that Mask0 is never used: lanes for which no path reaches this phi
1823   // are essentially undef and are taken from In0.
1824   VectorParts Entry(State.UF);
1825   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1826   for (unsigned In = 0; In < NumIncoming; ++In) {
1827     for (unsigned Part = 0; Part < State.UF; ++Part) {
1828       // We might have single-edge PHIs (blocks) - use an identity
1829       // 'select' for the first PHI operand.
1830       Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
1831       if (In == 0)
1832         Entry[Part] = In0; // Initialize with the first incoming value.
1833       else {
1834         // Select between the current value and the previous incoming edge
1835         // based on the incoming mask.
1836         Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
1837         Entry[Part] =
1838             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1839       }
1840     }
1841   }
1842 
1843   for (unsigned Part = 0; Part < State.UF; ++Part)
1844     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
1845 }
1846 
1847 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1848 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1849                           VPSlotTracker &SlotTracker) const {
1850   O << Indent << "BLEND ";
1851   printAsOperand(O, SlotTracker);
1852   O << " =";
1853   if (getNumIncomingValues() == 1) {
1854     // Not a User of any mask: not really blending, this is a
1855     // single-predecessor phi.
1856     O << " ";
1857     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1858   } else {
1859     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1860       O << " ";
1861       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1862       if (I == 0)
1863         continue;
1864       O << "/";
1865       getMask(I)->printAsOperand(O, SlotTracker);
1866     }
1867   }
1868 }
1869 #endif
1870 
1871 void VPReductionRecipe::execute(VPTransformState &State) {
1872   assert(!State.Instance && "Reduction being replicated.");
1873   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1874   RecurKind Kind = RdxDesc.getRecurrenceKind();
1875   // Propagate the fast-math flags carried by the underlying instruction.
1876   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1877   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1878   for (unsigned Part = 0; Part < State.UF; ++Part) {
1879     Value *NewVecOp = State.get(getVecOp(), Part);
1880     if (VPValue *Cond = getCondOp()) {
1881       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1882       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1883       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1884 
1885       Value *Start;
1886       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
1887         Start = RdxDesc.getRecurrenceStartValue();
1888       else
1889         Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
1890                                             RdxDesc.getFastMathFlags());
1891       if (State.VF.isVector())
1892         Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(),
1893                                                 Start);
1894 
1895       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
1896       NewVecOp = Select;
1897     }
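    // E.g. (illustrative) for an integer add reduction, the identity is 0:
    // lanes with a false condition select 0 and leave the reduced value
    // unchanged.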
1898     Value *NewRed;
1899     Value *NextInChain;
1900     if (IsOrdered) {
1901       if (State.VF.isVector())
1902         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1903                                         PrevInChain);
1904       else
1905         NewRed = State.Builder.CreateBinOp(
1906             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1907             NewVecOp);
1908       PrevInChain = NewRed;
1909       NextInChain = NewRed;
1910     } else {
1911       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1912       NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
1913       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1914         NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1915                                      NewRed, PrevInChain);
1916       else
1917         NextInChain = State.Builder.CreateBinOp(
1918             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed,
1919             PrevInChain);
1920     }
1921     State.set(this, NextInChain, Part, /*IsScalar*/ true);
1922   }
1923 }
1924 
1925 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1926   assert(!State.Instance && "Reduction being replicated.");
1927   assert(State.UF == 1 &&
1928          "Expected only UF == 1 when vectorizing with explicit vector length.");
1929 
1930   auto &Builder = State.Builder;
1931   // Propagate the fast-math flags carried by the underlying instruction.
1932   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1933   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1934   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1935 
1936   RecurKind Kind = RdxDesc.getRecurrenceKind();
1937   Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
1938   Value *VecOp = State.get(getVecOp(), 0);
1939   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
1940 
1941   VectorBuilder VBuilder(Builder);
1942   VBuilder.setEVL(EVL);
1943   Value *Mask;
1944   // TODO: move the all-true mask generation into VectorBuilder.
1945   if (VPValue *CondOp = getCondOp())
1946     Mask = State.get(CondOp, 0);
1947   else
1948     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
1949   VBuilder.setMask(Mask);
1950 
1951   Value *NewRed;
1952   if (isOrdered()) {
1953     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
1954   } else {
1955     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
1956     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1957       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
1958     else
1959       NewRed = Builder.CreateBinOp(
1960           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
1961   }
1962   State.set(this, NewRed, 0, /*IsScalar*/ true);
1963 }
1964 
1965 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1966 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1967                               VPSlotTracker &SlotTracker) const {
1968   O << Indent << "REDUCE ";
1969   printAsOperand(O, SlotTracker);
1970   O << " = ";
1971   getChainOp()->printAsOperand(O, SlotTracker);
1972   O << " +";
1973   if (isa<FPMathOperator>(getUnderlyingInstr()))
1974     O << getUnderlyingInstr()->getFastMathFlags();
1975   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1976   getVecOp()->printAsOperand(O, SlotTracker);
1977   if (isConditional()) {
1978     O << ", ";
1979     getCondOp()->printAsOperand(O, SlotTracker);
1980   }
1981   O << ")";
1982   if (RdxDesc.IntermediateStore)
1983     O << " (with final reduction value stored in invariant address sank "
1984          "outside of loop)";
1985 }
1986 
1987 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1988                                  VPSlotTracker &SlotTracker) const {
1989   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1990   O << Indent << "REDUCE ";
1991   printAsOperand(O, SlotTracker);
1992   O << " = ";
1993   getChainOp()->printAsOperand(O, SlotTracker);
1994   O << " +";
1995   if (isa<FPMathOperator>(getUnderlyingInstr()))
1996     O << getUnderlyingInstr()->getFastMathFlags();
1997   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1998   getVecOp()->printAsOperand(O, SlotTracker);
1999   O << ", ";
2000   getEVL()->printAsOperand(O, SlotTracker);
2001   if (isConditional()) {
2002     O << ", ";
2003     getCondOp()->printAsOperand(O, SlotTracker);
2004   }
2005   O << ")";
2006   if (RdxDesc.IntermediateStore)
2007     O << " (with final reduction value stored in invariant address sank "
2008          "outside of loop)";
2009 }
2010 #endif
2011 
2012 bool VPReplicateRecipe::shouldPack() const {
2013   // Check whether the recipe is used by a widened recipe via an intervening
2014   // VPPredInstPHIRecipe. In this case, also pack the scalar values into a vector.
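  // E.g. (illustrative) a predicated scalar load whose VPPredInstPHIRecipe
  // result feeds a widened add uses the value as a vector, so the replicated
  // scalars must be packed.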
2015   return any_of(users(), [](const VPUser *U) {
2016     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2017       return any_of(PredR->users(), [PredR](const VPUser *U) {
2018         return !U->usesScalars(PredR);
2019       });
2020     return false;
2021   });
2022 }
2023 
2024 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2025 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2026                               VPSlotTracker &SlotTracker) const {
2027   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2028 
2029   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2030     printAsOperand(O, SlotTracker);
2031     O << " = ";
2032   }
2033   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2034     O << "call";
2035     printFlags(O);
2036     O << "@" << CB->getCalledFunction()->getName() << "(";
2037     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2038                     O, [&O, &SlotTracker](VPValue *Op) {
2039                       Op->printAsOperand(O, SlotTracker);
2040                     });
2041     O << ")";
2042   } else {
2043     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2044     printFlags(O);
2045     printOperands(O, SlotTracker);
2046   }
2047 
2048   if (shouldPack())
2049     O << " (S->V)";
2050 }
2051 #endif
2052 
2053 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
2054 /// if it is either defined outside the vector region or its operand is known to
2055 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
2056 /// TODO: Uniformity should be associated with a VPValue and there should be a
2057 /// generic way to check.
2058 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
2059   return C->isDefinedOutsideVectorRegions() ||
2060          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
2061          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
2062 }
2063 
2064 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
2065   assert(vputils::onlyFirstLaneUsed(this) &&
2066          "Codegen only implemented for first lane.");
2067   switch (Opcode) {
2068   case Instruction::SExt:
2069   case Instruction::ZExt:
2070   case Instruction::Trunc: {
2071     // Note: SExt/ZExt not used yet.
2072     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
2073     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2074   }
2075   default:
2076     llvm_unreachable("opcode not implemented yet");
2077   }
2078 }
2079 
2080 void VPScalarCastRecipe::execute(VPTransformState &State) {
2081   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
2082   for (unsigned Part = 0; Part != State.UF; ++Part) {
2083     Value *Res;
2084     // Only generate a single instance if the recipe is uniform across UFs
2085     // and VFs.
2086     if (Part > 0 && IsUniformAcrossVFsAndUFs)
2087       Res = State.get(this, VPIteration(0, 0));
2088     else
2089       Res = generate(State, Part);
2090     State.set(this, Res, VPIteration(Part, 0));
2091   }
2092 }
2093 
2094 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2095 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2096                                 VPSlotTracker &SlotTracker) const {
2097   O << Indent << "SCALAR-CAST ";
2098   printAsOperand(O, SlotTracker);
2099   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2100   printOperands(O, SlotTracker);
2101   O << " to " << *ResultTy;
2102 }
2103 #endif
2104 
2105 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2106   assert(State.Instance && "Branch on Mask works only on single instance.");
2107 
2108   unsigned Part = State.Instance->Part;
2109   unsigned Lane = State.Instance->Lane.getKnownLane();
2110 
2111   Value *ConditionBit = nullptr;
2112   VPValue *BlockInMask = getMask();
2113   if (BlockInMask) {
2114     ConditionBit = State.get(BlockInMask, Part);
2115     if (ConditionBit->getType()->isVectorTy())
2116       ConditionBit = State.Builder.CreateExtractElement(
2117           ConditionBit, State.Builder.getInt32(Lane));
2118   } else // Block in mask is all-one.
2119     ConditionBit = State.Builder.getTrue();
2120 
2121   // Replace the temporary unreachable terminator with a new conditional branch,
2122   // whose two destinations will be set later when they are created.
2123   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2124   assert(isa<UnreachableInst>(CurrentTerminator) &&
2125          "Expected to replace unreachable terminator with conditional branch.");
2126   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2127   CondBr->setSuccessor(0, nullptr);
2128   ReplaceInstWithInst(CurrentTerminator, CondBr);
2129 }
2130 
2131 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2132   assert(State.Instance && "Predicated instruction PHI works per instance.");
2133   Instruction *ScalarPredInst =
2134       cast<Instruction>(State.get(getOperand(0), *State.Instance));
2135   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2136   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2137   assert(PredicatingBB && "Predicated block has no single predecessor.");
2138   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2139          "operand must be VPReplicateRecipe");
2140 
2141   // By current pack/unpack logic we need to generate only a single phi node: if
2142   // a vector value for the predicated instruction exists at this point it means
2143   // the instruction has vector users only, and a phi for the vector value is
2144   // needed. In this case the recipe of the predicated instruction is marked to
2145   // also do that packing, thereby "hoisting" the insert-element sequence.
2146   // Otherwise, a phi node for the scalar value is needed.
2147   unsigned Part = State.Instance->Part;
2148   if (State.hasVectorValue(getOperand(0), Part)) {
2149     Value *VectorValue = State.get(getOperand(0), Part);
2150     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2151     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2152     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2153     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2154     if (State.hasVectorValue(this, Part))
2155       State.reset(this, VPhi, Part);
2156     else
2157       State.set(this, VPhi, Part);
2158     // NOTE: Currently we need to update the value of the operand, so the next
2159     // predicated iteration inserts its generated value in the correct vector.
2160     State.reset(getOperand(0), VPhi, Part);
2161   } else {
2162     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2163     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2164     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2165                      PredicatingBB);
2166     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2167     if (State.hasScalarValue(this, *State.Instance))
2168       State.reset(this, Phi, *State.Instance);
2169     else
2170       State.set(this, Phi, *State.Instance);
2171     // NOTE: Currently we need to update the value of the operand, so the next
2172     // predicated iteration inserts its generated value in the correct vector.
2173     State.reset(getOperand(0), Phi, *State.Instance);
2174   }
2175 }
2176 
2177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2178 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2179                                 VPSlotTracker &SlotTracker) const {
2180   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2181   printAsOperand(O, SlotTracker);
2182   O << " = ";
2183   printOperands(O, SlotTracker);
2184 }
2185 #endif
2186 
2187 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2188                                                  VPCostContext &Ctx) const {
2189   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2190   const Align Alignment =
2191       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2192   unsigned AS =
2193       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2194   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2195 
2196   if (!Consecutive) {
2197     // TODO: Using the original IR may not be accurate.
2198     // Currently, ARM will use the underlying IR to calculate gather/scatter
2199     // instruction cost.
2200     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2201     assert(!Reverse &&
2202            "Inconsecutive memory access should not have the order.");
2203     return Ctx.TTI.getAddressComputationCost(Ty) +
2204            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2205                                           IsMasked, Alignment, CostKind,
2206                                           &Ingredient);
2207   }
2208 
2209   InstructionCost Cost = 0;
2210   if (IsMasked) {
2211     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2212                                           AS, CostKind);
2213   } else {
2214     TTI::OperandValueInfo OpInfo =
2215         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2216     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2217                                     CostKind, OpInfo, &Ingredient);
2218   }
2219   if (!Reverse)
2220     return Cost;
2221 
2222   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2223                                        cast<VectorType>(Ty), std::nullopt,
2224                                        CostKind, 0);
2225 }
2226 
2227 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2228   auto *LI = cast<LoadInst>(&Ingredient);
2229 
2230   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2231   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2232   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2233   bool CreateGather = !isConsecutive();
2234 
2235   auto &Builder = State.Builder;
2236   State.setDebugLocFrom(getDebugLoc());
2237   for (unsigned Part = 0; Part < State.UF; ++Part) {
2238     Value *NewLI;
2239     Value *Mask = nullptr;
2240     if (auto *VPMask = getMask()) {
2241       // Mask reversal is only needed for non-all-one masks; an all-one mask
2242       // is represented as a null mask, and its reverse is again all-one (null).
2243       Mask = State.get(VPMask, Part);
2244       if (isReverse())
2245         Mask = Builder.CreateVectorReverse(Mask, "reverse");
2246     }
2247 
2248     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
2249     if (CreateGather) {
2250       NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2251                                          "wide.masked.gather");
2252     } else if (Mask) {
2253       NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2254                                        PoisonValue::get(DataTy),
2255                                        "wide.masked.load");
2256     } else {
2257       NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2258     }
2259     // Add metadata to the load, but set the result to the reverse shuffle.
2260     State.addMetadata(NewLI, LI);
2261     if (Reverse)
2262       NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2263     State.set(this, NewLI, Part);
2264   }
2265 }
2266 
2267 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2268 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2269                               VPSlotTracker &SlotTracker) const {
2270   O << Indent << "WIDEN ";
2271   printAsOperand(O, SlotTracker);
2272   O << " = load ";
2273   printOperands(O, SlotTracker);
2274 }
2275 #endif
2276 
2277 /// Use an all-true mask for the reverse rather than the actual mask, as it
2278 /// avoids a dependence without affecting the result.
2279 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2280                                      Value *EVL, const Twine &Name) {
2281   VectorType *ValTy = cast<VectorType>(Operand->getType());
2282   Value *AllTrueMask =
2283       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2284   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2285                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2286 }
2287 
2288 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2289   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
2290                           "explicit vector length.");
2291   auto *LI = cast<LoadInst>(&Ingredient);
2292 
2293   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2294   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2295   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2296   bool CreateGather = !isConsecutive();
2297 
2298   auto &Builder = State.Builder;
2299   State.setDebugLocFrom(getDebugLoc());
2300   CallInst *NewLI;
2301   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2302   Value *Addr = State.get(getAddr(), 0, !CreateGather);
2303   Value *Mask = nullptr;
2304   if (VPValue *VPMask = getMask()) {
2305     Mask = State.get(VPMask, 0);
2306     if (isReverse())
2307       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2308   } else {
2309     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2310   }
2311 
2312   if (CreateGather) {
2313     NewLI =
2314         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2315                                 nullptr, "wide.masked.gather");
2316   } else {
2317     VectorBuilder VBuilder(Builder);
2318     VBuilder.setEVL(EVL).setMask(Mask);
2319     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2320         Instruction::Load, DataTy, Addr, "vp.op.load"));
2321   }
2322   NewLI->addParamAttr(
2323       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2324   State.addMetadata(NewLI, LI);
2325   Instruction *Res = NewLI;
2326   if (isReverse())
2327     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2328   State.set(this, Res, 0);
2329 }
2330 
2331 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2332 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2333                                  VPSlotTracker &SlotTracker) const {
2334   O << Indent << "WIDEN ";
2335   printAsOperand(O, SlotTracker);
2336   O << " = vp.load ";
2337   printOperands(O, SlotTracker);
2338 }
2339 #endif
2340 
2341 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2342   auto *SI = cast<StoreInst>(&Ingredient);
2343 
2344   VPValue *StoredVPValue = getStoredValue();
2345   bool CreateScatter = !isConsecutive();
2346   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2347 
2348   auto &Builder = State.Builder;
2349   State.setDebugLocFrom(getDebugLoc());
2350 
2351   for (unsigned Part = 0; Part < State.UF; ++Part) {
2352     Instruction *NewSI = nullptr;
2353     Value *Mask = nullptr;
2354     if (auto *VPMask = getMask()) {
2355       // Mask reversal is only needed for non-all-one masks; an all-one mask
2356       // is represented as a null mask, and its reverse is again all-one (null).
2357       Mask = State.get(VPMask, Part);
2358       if (isReverse())
2359         Mask = Builder.CreateVectorReverse(Mask, "reverse");
2360     }
2361 
2362     Value *StoredVal = State.get(StoredVPValue, Part);
2363     if (isReverse()) {
2364       // If we store to reverse consecutive memory locations, then we need
2365       // to reverse the order of elements in the stored value.
2366       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2367       // We don't want to update the value in the map as it might be used in
2368       // another expression. So don't call resetVectorValue(StoredVal).
2369     }
2370     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
2371     if (CreateScatter)
2372       NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2373     else if (Mask)
2374       NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2375     else
2376       NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2377     State.addMetadata(NewSI, SI);
2378   }
2379 }
2380 
2381 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2382 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2383                                VPSlotTracker &SlotTracker) const {
2384   O << Indent << "WIDEN store ";
2385   printOperands(O, SlotTracker);
2386 }
2387 #endif
2388 
2389 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2390   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
2391                           "explicit vector length.");
2392   auto *SI = cast<StoreInst>(&Ingredient);
2393 
2394   VPValue *StoredValue = getStoredValue();
2395   bool CreateScatter = !isConsecutive();
2396   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2397 
2398   auto &Builder = State.Builder;
2399   State.setDebugLocFrom(getDebugLoc());
2400 
2401   CallInst *NewSI = nullptr;
2402   Value *StoredVal = State.get(StoredValue, 0);
2403   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2404   if (isReverse())
2405     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2406   Value *Mask = nullptr;
2407   if (VPValue *VPMask = getMask()) {
2408     Mask = State.get(VPMask, 0);
2409     if (isReverse())
2410       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2411   } else {
2412     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2413   }
2414   Value *Addr = State.get(getAddr(), 0, !CreateScatter);
2415   if (CreateScatter) {
2416     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2417                                     Intrinsic::vp_scatter,
2418                                     {StoredVal, Addr, Mask, EVL});
2419   } else {
2420     VectorBuilder VBuilder(Builder);
2421     VBuilder.setEVL(EVL).setMask(Mask);
2422     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2423         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2424         {StoredVal, Addr}));
2425   }
2426   NewSI->addParamAttr(
2427       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2428   State.addMetadata(NewSI, SI);
2429 }
2430 
2431 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2432 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2433                                   VPSlotTracker &SlotTracker) const {
2434   O << Indent << "WIDEN vp.store ";
2435   printOperands(O, SlotTracker);
2436 }
2437 #endif
2438 
2439 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2440                                      VectorType *DstVTy, const DataLayout &DL) {
2441   // Verify that V is a vector with the same number of elements as DstVTy.
2442   auto VF = DstVTy->getElementCount();
2443   auto *SrcVecTy = cast<VectorType>(V->getType());
2444   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2445   Type *SrcElemTy = SrcVecTy->getElementType();
2446   Type *DstElemTy = DstVTy->getElementType();
2447   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2448          "Vector elements must have same size");
2449 
2450   // Do a direct cast if element types are castable.
2451   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2452     return Builder.CreateBitOrPointerCast(V, DstVTy);
2453   }
2454   // V cannot be cast directly to the desired vector type. This may happen
2455   // when V is a floating-point vector but DstVTy is a vector of pointers,
2456   // or vice versa. Handle this with a two-step bitcast through an
2457   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
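  // E.g. (illustrative, assuming 64-bit pointers): <4 x double> <-> <4 x ptr>
  // is lowered as <4 x double> -> <4 x i64> -> <4 x ptr>.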
2458   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2459          "Only one type should be a pointer type");
2460   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2461          "Only one type should be a floating point type");
2462   Type *IntTy =
2463       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2464   auto *VecIntTy = VectorType::get(IntTy, VF);
2465   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2466   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2467 }
2468 
2469 /// Return a vector containing interleaved elements from multiple
2470 /// smaller input vectors.
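/// E.g. (illustrative) interleaving two <4 x i32> vectors A and B yields
/// <A0, B0, A1, B1, A2, B2, A3, B3>.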
2471 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2472                                 const Twine &Name) {
2473   unsigned Factor = Vals.size();
2474   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2475 
2476   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2477 #ifndef NDEBUG
2478   for (Value *Val : Vals)
2479     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2480 #endif
2481 
2482   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2483   // we must use intrinsics to interleave.
2484   if (VecTy->isScalableTy()) {
2485     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2486     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2487                                    Vals,
2488                                    /*FMFSource=*/nullptr, Name);
2489   }
2490 
2491   // Fixed length. Start by concatenating all vectors into a wide vector.
2492   Value *WideVec = concatenateVectors(Builder, Vals);
2493 
2494   // Interleave the elements into the wide vector.
2495   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2496   return Builder.CreateShuffleVector(
2497       WideVec, createInterleaveMask(NumElts, Factor), Name);
2498 }
2499 
2500 // Try to vectorize the interleave group that \p Instr belongs to.
2501 //
2502 // E.g. Translate following interleaved load group (factor = 3):
2503 //   for (i = 0; i < N; i+=3) {
2504 //     R = Pic[i];             // Member of index 0
2505 //     G = Pic[i+1];           // Member of index 1
2506 //     B = Pic[i+2];           // Member of index 2
2507 //     ... // do something to R, G, B
2508 //   }
2509 // To:
2510 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2511 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2512 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2513 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2514 //
2515 // Or translate following interleaved store group (factor = 3):
2516 //   for (i = 0; i < N; i+=3) {
2517 //     ... do something to R, G, B
2518 //     Pic[i]   = R;           // Member of index 0
2519 //     Pic[i+1] = G;           // Member of index 1
2520 //     Pic[i+2] = B;           // Member of index 2
2521 //   }
2522 // To:
2523 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2524 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2525 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2526 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2527 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2528 void VPInterleaveRecipe::execute(VPTransformState &State) {
2529   assert(!State.Instance && "Interleave group being replicated.");
2530   const InterleaveGroup<Instruction> *Group = IG;
2531   Instruction *Instr = Group->getInsertPos();
2532 
2533   // Prepare for the vector type of the interleaved load/store.
2534   Type *ScalarTy = getLoadStoreType(Instr);
2535   unsigned InterleaveFactor = Group->getFactor();
2536   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2537 
2538   // Prepare for the new pointers.
2539   SmallVector<Value *, 2> AddrParts;
2540   unsigned Index = Group->getIndex(Instr);
2541 
2542   // TODO: extend the masked interleaved-group support to reversed access.
2543   VPValue *BlockInMask = getMask();
2544   assert((!BlockInMask || !Group->isReverse()) &&
2545          "Reversed masked interleave-group not supported.");
2546 
2547   Value *Idx;
2548   // If the group is reverse, adjust the index to refer to the last vector lane
2549   // instead of the first. We adjust the index from the first vector lane,
2550   // rather than directly getting the pointer for lane VF - 1, because the
2551   // pointer operand of the interleaved access is supposed to be uniform. For
2552   // uniform instructions, we're only required to generate a value for the
2553   // first vector lane in each unroll iteration.
2554   if (Group->isReverse()) {
2555     Value *RuntimeVF =
2556         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2557     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2558     Idx = State.Builder.CreateMul(Idx,
2559                                   State.Builder.getInt32(Group->getFactor()));
2560     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2561     Idx = State.Builder.CreateNeg(Idx);
2562   } else
2563     Idx = State.Builder.getInt32(-Index);
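  // E.g. (illustrative) for a reversed group with fixed-width VF = 4,
  // factor = 3, and member index 1: Idx = -((4 - 1) * 3 + 1) = -10 elements.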
2564 
2565   VPValue *Addr = getAddr();
2566   for (unsigned Part = 0; Part < State.UF; Part++) {
2567     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2568     if (auto *I = dyn_cast<Instruction>(AddrPart))
2569       State.setDebugLocFrom(I->getDebugLoc());
2570 
2571     // Note that the current instruction could be at any member index. We need
2572     // to adjust the address to point at the member of index 0.
2573     //
2574     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2575     //       b = A[i];       // Member of index 0
2576     // The current pointer points to A[i+1]; adjust it to A[i].
2577     //
2578     // E.g.  A[i+1] = a;     // Member of index 1
2579     //       A[i]   = b;     // Member of index 0
2580     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2581     // The current pointer points to A[i+2]; adjust it to A[i].
2582 
2583     bool InBounds = false;
2584     if (auto *GEP = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2585       InBounds = GEP->isInBounds();
2586     AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2587     AddrParts.push_back(AddrPart);
2588   }
2589 
2590   State.setDebugLocFrom(Instr->getDebugLoc());
2591   Value *PoisonVec = PoisonValue::get(VecTy);
2592 
2593   auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
2594                              unsigned Part, Value *MaskForGaps) -> Value * {
2595     if (State.VF.isScalable()) {
2596       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2597       assert(InterleaveFactor == 2 &&
2598              "Unsupported deinterleave factor for scalable vectors");
2599       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2600       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2601       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2602                                      State.VF.getKnownMinValue() * 2, true);
2603       return State.Builder.CreateIntrinsic(
2604           MaskTy, Intrinsic::vector_interleave2, Ops,
2605           /*FMFSource=*/nullptr, "interleaved.mask");
2606     }
2607 
2608     if (!BlockInMask)
2609       return MaskForGaps;
2610 
2611     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2612     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2613         BlockInMaskPart,
2614         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2615         "interleaved.mask");
2616     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2617                                                    ShuffledMask, MaskForGaps)
2618                        : ShuffledMask;
2619   };
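  // E.g. (illustrative) for fixed-width VF = 4 and factor = 3, a block mask
  // <m0, m1, m2, m3> is shuffled to
  // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>.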
2620 
2621   const DataLayout &DL = Instr->getDataLayout();
2622   // Vectorize the interleaved load group.
2623   if (isa<LoadInst>(Instr)) {
2624     Value *MaskForGaps = nullptr;
2625     if (NeedsMaskForGaps) {
2626       MaskForGaps = createBitMaskForGaps(State.Builder,
2627                                          State.VF.getKnownMinValue(), *Group);
2628       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2629     }
2630 
2631     // For each unroll part, create a wide load for the group.
2632     SmallVector<Value *, 2> NewLoads;
2633     for (unsigned Part = 0; Part < State.UF; Part++) {
2634       Instruction *NewLoad;
2635       if (BlockInMask || MaskForGaps) {
2636         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2637         NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
2638                                                  Group->getAlign(), GroupMask,
2639                                                  PoisonVec, "wide.masked.vec");
2640       } else
2641         NewLoad = State.Builder.CreateAlignedLoad(
2642             VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
2643       Group->addMetadata(NewLoad);
2644       NewLoads.push_back(NewLoad);
2645     }
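    // For the masked path above the emitted IR is roughly (illustrative
    // types and names, factor 2, VF 4, alignment 4):
    //   %wide.masked.vec = call <8 x i32> @llvm.masked.load.v8i32.p0(
    //       ptr %addr, i32 4, <8 x i1> %interleaved.mask, <8 x i32> poison)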
2646 
2647     ArrayRef<VPValue *> VPDefs = definedValues();
2648     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2649     if (VecTy->isScalableTy()) {
2650       assert(InterleaveFactor == 2 &&
2651              "Unsupported deinterleave factor for scalable vectors");
2652 
2653       for (unsigned Part = 0; Part < State.UF; ++Part) {
2654         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2655         // so we must use intrinsics to deinterleave.
2656         Value *DI = State.Builder.CreateIntrinsic(
2657             Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2658             /*FMFSource=*/nullptr, "strided.vec");
2659         unsigned J = 0;
2660         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2661           Instruction *Member = Group->getMember(I);
2662 
2663           if (!Member)
2664             continue;
2665 
2666           Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2667           // If this member has a different type, cast the result to it.
2668           if (Member->getType() != ScalarTy) {
2669             VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2670             StridedVec =
2671                 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2672           }
2673 
2674           if (Group->isReverse())
2675             StridedVec =
2676                 State.Builder.CreateVectorReverse(StridedVec, "reverse");
2677 
2678           State.set(VPDefs[J], StridedVec, Part);
2679           ++J;
2680         }
2681       }
2682 
2683       return;
2684     }
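    // The scalable path above emits, roughly (illustrative, factor 2):
    //   %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
    //       @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
    // followed by an extractvalue for each live member of the group.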
2685 
2686     // For each member in the group, shuffle out the appropriate data from the
2687     // wide loads.
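    // For example, with factor 2 and VF 4, member 0 is extracted with the
    // stride mask <0, 2, 4, 6> and member 1 with <1, 3, 5, 7>.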
2688     unsigned J = 0;
2689     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2690       Instruction *Member = Group->getMember(I);
2691 
2692       // Skip the gaps in the group.
2693       if (!Member)
2694         continue;
2695 
2696       auto StrideMask =
2697           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2698       for (unsigned Part = 0; Part < State.UF; Part++) {
2699         Value *StridedVec = State.Builder.CreateShuffleVector(
2700             NewLoads[Part], StrideMask, "strided.vec");
2701 
2702         // If this member has a different type, cast the result to it.
2703         if (Member->getType() != ScalarTy) {
2704           assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2705           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2706           StridedVec =
2707               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2708         }
2709 
2710         if (Group->isReverse())
2711           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2712 
2713         State.set(VPDefs[J], StridedVec, Part);
2714       }
2715       ++J;
2716     }
2717     return;
2718   }
2719 
2720   // The sub-vector type for the current instruction.
2721   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2722 
2723   // Vectorize the interleaved store group.
2724   Value *MaskForGaps =
2725       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2726   assert((!MaskForGaps || !State.VF.isScalable()) &&
2727          "masking gaps for scalable vectors is not yet supported.");
2728   ArrayRef<VPValue *> StoredValues = getStoredValues();
2729   for (unsigned Part = 0; Part < State.UF; Part++) {
2730     // Collect the stored vector from each member.
2731     SmallVector<Value *, 4> StoredVecs;
2732     unsigned StoredIdx = 0;
2733     for (unsigned I = 0; I < InterleaveFactor; I++) {
2734       assert((Group->getMember(I) || MaskForGaps) &&
2735              "Failed to get a member from an interleaved store group");
2736       Instruction *Member = Group->getMember(I);
2737 
2738       // Skip the gaps in the group.
2739       if (!Member) {
2740         Value *Undef = PoisonValue::get(SubVT);
2741         StoredVecs.push_back(Undef);
2742         continue;
2743       }
2744 
2745       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2746       ++StoredIdx;
2747 
2748       if (Group->isReverse())
2749         StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2750 
2751       // If this member has a different type, cast it to the unified type.
2752 
2753       if (StoredVec->getType() != SubVT)
2754         StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2755 
2756       StoredVecs.push_back(StoredVec);
2757     }
2758 
2759     // Interleave all the smaller vectors into one wider vector.
2760     Value *IVec =
2761         interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
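    // E.g. for factor 2 and VF 4, interleaving <a0, a1, a2, a3> and
    // <b0, b1, b2, b3> gives <a0, b0, a1, b1, a2, b2, a3, b3>.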
2762     Instruction *NewStoreInstr;
2763     if (BlockInMask || MaskForGaps) {
2764       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2765       NewStoreInstr = State.Builder.CreateMaskedStore(
2766           IVec, AddrParts[Part], Group->getAlign(), GroupMask);
2767     } else
2768       NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
2769                                                        Group->getAlign());
2770 
2771     Group->addMetadata(NewStoreInstr);
2772   }
2773 }
2774 
2775 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2776 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2777                                VPSlotTracker &SlotTracker) const {
2778   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2779   IG->getInsertPos()->printAsOperand(O, false);
2780   O << ", ";
2781   getAddr()->printAsOperand(O, SlotTracker);
2782   VPValue *Mask = getMask();
2783   if (Mask) {
2784     O << ", ";
2785     Mask->printAsOperand(O, SlotTracker);
2786   }
2787 
2788   unsigned OpIdx = 0;
2789   for (unsigned I = 0; I < IG->getFactor(); ++I) {
2790     if (!IG->getMember(I))
2791       continue;
2792     if (getNumStoreOperands() > 0) {
2793       O << "\n" << Indent << "  store ";
2794       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2795       O << " to index " << I;
2796     } else {
2797       O << "\n" << Indent << "  ";
2798       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2799       O << " = load from index " << I;
2800     }
2801     ++OpIdx;
2802   }
2803 }
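// Sample output of the printer above (operand names illustrative):
//   INTERLEAVE-GROUP with factor 2 at %strided.load, ir<%gep>
//     vp<%5> = load from index 0
//     vp<%6> = load from index 1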
2804 #endif
2805 
2806 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2807   Value *Start = getStartValue()->getLiveInIRValue();
2808   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
2809   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2810 
2811   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2812   EntryPart->addIncoming(Start, VectorPH);
2813   EntryPart->setDebugLoc(getDebugLoc());
2814   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2815     State.set(this, EntryPart, Part, /*IsScalar*/ true);
2816 }
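// The phi created above initially has only the preheader incoming value,
// e.g. (illustrative): %index = phi i64 [ 0, %vector.ph ]; the backedge
// value is wired up later, once the latch block exists.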
2817 
2818 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2819 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2820                                    VPSlotTracker &SlotTracker) const {
2821   O << Indent << "EMIT ";
2822   printAsOperand(O, SlotTracker);
2823   O << " = CANONICAL-INDUCTION ";
2824   printOperands(O, SlotTracker);
2825 }
2826 #endif
2827 
2828 bool VPCanonicalIVPHIRecipe::isCanonical(
2829     InductionDescriptor::InductionKind Kind, VPValue *Start,
2830     VPValue *Step) const {
2831   // Must be an integer induction.
2832   if (Kind != InductionDescriptor::IK_IntInduction)
2833     return false;
2834   // Start must match the start value of this canonical induction.
2835   if (Start != getStartValue())
2836     return false;
2837 
2838   // If the step is defined by a recipe, it is not a ConstantInt.
2839   if (Step->getDefiningRecipe())
2840     return false;
2841 
2842   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2843   return StepC && StepC->isOne();
2844 }
2845 
2846 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2847   return IsScalarAfterVectorization &&
2848          (!IsScalable || vputils::onlyFirstLaneUsed(this));
2849 }
2850 
2851 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
2852   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
2853          "Not a pointer induction according to InductionDescriptor!");
2854   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
2855          "Unexpected type.");
2856   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
2857          "Recipe should have been replaced");
2858 
2859   auto *IVR = getParent()->getPlan()->getCanonicalIV();
2860   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
2861   Type *PhiType = IndDesc.getStep()->getType();
2862 
2863   // Build a pointer phi
2864   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
2865   Type *ScStValueType = ScalarStartValue->getType();
2866   PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
2867                                            CanonicalIV->getIterator());
2868 
2869   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2870   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
2871 
2872   // A pointer induction, performed using a GEP.
2873   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
2874 
2875   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
2876   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
2877   Value *NumUnrolledElems =
2878       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
2879   Value *InductionGEP = GetElementPtrInst::Create(
2880       State.Builder.getInt8Ty(), NewPointerPhi,
2881       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
2882       InductionLoc);
2883   // Add induction update using an incorrect block temporarily. The phi node
2884   // will be fixed after VPlan execution. Note that at this point the latch
2885   // block cannot be used, as it does not exist yet.
2886   // TODO: Model increment value in VPlan, by turning the recipe into a
2887   // multi-def and a subclass of VPHeaderPHIRecipe.
2888   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
2889 
2890   // Create UF actual address GEPs that use the pointer phi as their base
2891   // and a vectorized version of the step value (<step*0, ..., step*N>) as
2892   // the offset.
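  // E.g. with VF = 4, UF = 2 and byte step S, part 0 addresses offsets
  // <0, 1, 2, 3> * S from the pointer phi and part 1 addresses
  // <4, 5, 6, 7> * S, while ptr.ind above advances the phi by 4 * 2 * S
  // bytes per vector iteration (a sketch of the arithmetic, not emitted
  // verbatim).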
2893   for (unsigned Part = 0; Part < State.UF; ++Part) {
2894     Type *VecPhiType = VectorType::get(PhiType, State.VF);
2895     Value *StartOffsetScalar =
2896         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
2897     Value *StartOffset =
2898         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
2899     // Create a vector of consecutive numbers from zero to VF-1.
2900     StartOffset = State.Builder.CreateAdd(
2901         StartOffset, State.Builder.CreateStepVector(VecPhiType));
2902 
2903     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
2904            "scalar step must be the same across all parts");
2905     Value *GEP = State.Builder.CreateGEP(
2906         State.Builder.getInt8Ty(), NewPointerPhi,
2907         State.Builder.CreateMul(
2908             StartOffset,
2909             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
2910             "vector.gep"));
2911     State.set(this, GEP, Part);
2912   }
2913 }
2914 
2915 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2916 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2917                                           VPSlotTracker &SlotTracker) const {
2918   O << Indent << "EMIT ";
2919   printAsOperand(O, SlotTracker);
2920   O << " = WIDEN-POINTER-INDUCTION ";
2921   getStartValue()->printAsOperand(O, SlotTracker);
2922   O << ", " << *IndDesc.getStep();
2923 }
2924 #endif
2925 
2926 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2927   assert(!State.Instance && "cannot be used in per-lane mode");
2928   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2929   SCEVExpander Exp(SE, DL, "induction");
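  // For example (illustrative), a SCEV such as (4 * %n) is expanded here into
  // scalar IR (a multiply or equivalent shift of %n) at the current insert
  // point; the single expanded value is then reused for all unroll parts.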
2930 
2931   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
2932                                  &*State.Builder.GetInsertPoint());
2933   assert(!State.ExpandedSCEVs.contains(Expr) &&
2934          "Same SCEV expanded multiple times");
2935   State.ExpandedSCEVs[Expr] = Res;
2936   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2937     State.set(this, Res, {Part, 0});
2938 }
2939 
2940 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2941 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2942                                VPSlotTracker &SlotTracker) const {
2943   O << Indent << "EMIT ";
2944   getVPSingleValue()->printAsOperand(O, SlotTracker);
2945   O << " = EXPAND SCEV " << *Expr;
2946 }
2947 #endif
2948 
2949 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2950   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
2951   Type *STy = CanonicalIV->getType();
2952   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2953   ElementCount VF = State.VF;
2954   Value *VStart = VF.isScalar()
2955                       ? CanonicalIV
2956                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
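  // E.g. with VF = 4, UF = 2 and canonical IV %index, the loop below yields
  // part 0 = <%index+0, %index+1, %index+2, %index+3> and
  // part 1 = <%index+4, ..., %index+7> (a sketch of the lane values).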
2957   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2958     Value *VStep = createStepForVF(Builder, STy, VF, Part);
2959     if (VF.isVector()) {
2960       VStep = Builder.CreateVectorSplat(VF, VStep);
2961       VStep =
2962           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
2963     }
2964     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
2965     State.set(this, CanonicalVectorIV, Part);
2966   }
2967 }
2968 
2969 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2970 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
2971                                      VPSlotTracker &SlotTracker) const {
2972   O << Indent << "EMIT ";
2973   printAsOperand(O, SlotTracker);
2974   O << " = WIDEN-CANONICAL-INDUCTION ";
2975   printOperands(O, SlotTracker);
2976 }
2977 #endif
2978 
2979 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
2980   auto &Builder = State.Builder;
2981   // Create a vector from the initial value.
2982   auto *VectorInit = getStartValue()->getLiveInIRValue();
2983 
2984   Type *VecTy = State.VF.isScalar()
2985                     ? VectorInit->getType()
2986                     : VectorType::get(VectorInit->getType(), State.VF);
2987 
2988   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2989   if (State.VF.isVector()) {
2990     auto *IdxTy = Builder.getInt32Ty();
2991     auto *One = ConstantInt::get(IdxTy, 1);
2992     IRBuilder<>::InsertPointGuard Guard(Builder);
2993     Builder.SetInsertPoint(VectorPH->getTerminator());
2994     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
2995     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
2996     VectorInit = Builder.CreateInsertElement(
2997         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
2998   }
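  // E.g. with VF = 4 and scalar start value %init, the code above produces
  // vector.recur.init = <poison, poison, poison, %init>, placing the initial
  // value in the last lane so the recurrence can splice it into the first
  // iteration of the vector loop.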
2999 
3000   // Create a phi node for the new recurrence.
3001   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
3002   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3003   EntryPart->addIncoming(VectorInit, VectorPH);
3004   State.set(this, EntryPart, 0);
3005 }
3006 
3007 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3008 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3009                                             VPSlotTracker &SlotTracker) const {
3010   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3011   printAsOperand(O, SlotTracker);
3012   O << " = phi ";
3013   printOperands(O, SlotTracker);
3014 }
3015 #endif
3016 
3017 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3018   auto &Builder = State.Builder;
3019 
3020   // Reductions do not have to start at zero. They can start with
3021   // any loop-invariant value.
3022   VPValue *StartVPV = getStartValue();
3023   Value *StartV = StartVPV->getLiveInIRValue();
3024 
3025   // In order to support recurrences we need to be able to vectorize Phi nodes.
3026   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3027   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3028   // this value when we vectorize all of the instructions that use the PHI.
3029   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3030   Type *VecTy = ScalarPHI ? StartV->getType()
3031                           : VectorType::get(StartV->getType(), State.VF);
3032 
3033   BasicBlock *HeaderBB = State.CFG.PrevBB;
3034   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
3035          "recipe must be in the vector loop header");
3036   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
3037   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
3038     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
3039     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
3040     State.set(this, EntryPart, Part, IsInLoop);
3041   }
3042 
3043   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3044 
3045   Value *Iden = nullptr;
3046   RecurKind RK = RdxDesc.getRecurrenceKind();
3047   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3048       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3049     // MinMax and AnyOf reductions have the start value as their identity.
3050     if (ScalarPHI) {
3051       Iden = StartV;
3052     } else {
3053       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3054       Builder.SetInsertPoint(VectorPH->getTerminator());
3055       StartV = Iden =
3056           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
3057     }
3058   } else {
3059     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3060                                        RdxDesc.getFastMathFlags());
3061 
3062     if (!ScalarPHI) {
3063       Iden = Builder.CreateVectorSplat(State.VF, Iden);
3064       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3065       Builder.SetInsertPoint(VectorPH->getTerminator());
3066       Constant *Zero = Builder.getInt32(0);
3067       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
3068     }
3069   }
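  // E.g. for an integer add reduction with start value %s and VF 4 this
  // produces Iden = <0, 0, 0, 0> and StartV = <%s, 0, 0, 0>; the loop below
  // then feeds StartV into part 0 only and the plain identity into the rest.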
3070 
3071   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
3072     Value *EntryPart = State.get(this, Part, IsInLoop);
3073     // Make sure to add the reduction start value only to the
3074     // first unroll part.
3075     Value *StartVal = (Part == 0) ? StartV : Iden;
3076     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
3077   }
3078 }
3079 
3080 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3081 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3082                                  VPSlotTracker &SlotTracker) const {
3083   O << Indent << "WIDEN-REDUCTION-PHI ";
3084 
3085   printAsOperand(O, SlotTracker);
3086   O << " = phi ";
3087   printOperands(O, SlotTracker);
3088 }
3089 #endif
3090 
3091 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3092   assert(EnableVPlanNativePath &&
3093          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3094 
3095   Value *Op0 = State.get(getOperand(0), 0);
3096   Type *VecTy = Op0->getType();
3097   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3098   State.set(this, VecPhi, 0);
3099 }
3100 
3101 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3102 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3103                              VPSlotTracker &SlotTracker) const {
3104   O << Indent << "WIDEN-PHI ";
3105 
3106   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3107   // Unless all incoming values are modeled in VPlan, print the original PHI
3108   // directly.
3109   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3110   // values as VPValues.
3111   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3112     O << VPlanIngredient(OriginalPhi);
3113     return;
3114   }
3115 
3116   printAsOperand(O, SlotTracker);
3117   O << " = phi ";
3118   printOperands(O, SlotTracker);
3119 }
3120 #endif
3121 
3122 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3123 // remove VPActiveLaneMaskPHIRecipe.
3124 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3125   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3126   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
3127     Value *StartMask = State.get(getOperand(0), Part);
3128     PHINode *EntryPart =
3129         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3130     EntryPart->addIncoming(StartMask, VectorPH);
3131     EntryPart->setDebugLoc(getDebugLoc());
3132     State.set(this, EntryPart, Part);
3133   }
3134 }
3135 
3136 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3137 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3138                                       VPSlotTracker &SlotTracker) const {
3139   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3140 
3141   printAsOperand(O, SlotTracker);
3142   O << " = phi ";
3143   printOperands(O, SlotTracker);
3144 }
3145 #endif
3146 
3147 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
3148   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3149   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
3150   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
3151   PHINode *EntryPart =
3152       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
3153   EntryPart->addIncoming(Start, VectorPH);
3154   EntryPart->setDebugLoc(getDebugLoc());
3155   State.set(this, EntryPart, 0, /*IsScalar=*/true);
3156 }
3157 
3158 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3159 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3160                                   VPSlotTracker &SlotTracker) const {
3161   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3162 
3163   printAsOperand(O, SlotTracker);
3164   O << " = phi ";
3165   printOperands(O, SlotTracker);
3166 }
3167 #endif
3168