//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPIteration(0, 0));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only computes the cost of an instruction
  // via its underlying instruction. Removing the WidenMem case here would
  // prevent force-target-instruction-cost from overriding the cost of recipes
  // with an underlying instruction, which would be inconsistent with the
  // legacy model.
  // TODO: Remove WidenMem from this function once we no longer need to
  // compare against the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost of the recipe by falling back to the legacy cost model,
  // using the underlying instruction. If there is no underlying instruction,
  // return 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing its cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

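    // Illustrative note: get.active.lane.mask produces a <VF x i1> mask where,
    // roughly, lane i is true iff VIVElem0 + i < ScalarTC (see the LangRef for
    // the exact overflow semantics).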
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

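    // Illustrative summary: with Step = VF * UF below, this computes
    // TC > Step ? TC - Step : 0, i.e. max(TC - VF * UF, 0), matching the
    // printed form "TC > VF ? TC - VF : 0".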
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

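      // Illustrative note: the returned EVL is at most AVL and at most
      // VF * vscale; the trailing i1 true operand marks the VF as scalable.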
      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop so that
    // vsetvli can be moved outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (number of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as its first argument, so we
    // need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that in-loop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
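    // E.g., Offset == 1 (illustrative) extracts the value from the last lane
    // of the last unrolled part, or from part UF - 1 in the scalar case below.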
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When the loop is unrolled without vectorization, retrieve the value
      // of part UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPIRInstruction::execute(VPTransformState &State) {
  assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
         "Only PHINodes can have extra operands");
  if (getNumOperands() == 1) {
    VPValue *ExitValue = getOperand(0);
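    // Illustrative note: values uniform after vectorization can be read from
    // the first lane; otherwise the exiting value lives in the last lane of
    // the final unrolled part.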
    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
    auto *Phi = cast<PHINode>(&I);
    Phi->addIncoming(V, PredBB);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;

  if (getNumOperands() != 0) {
    assert(getNumOperands() == 1 && "can have at most 1 operand");
    O << " (extra operand: ";
    printOperands(O, SlotTracker);
    O << ")";
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (Variant) {
    return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                    Variant->getFunctionType()->params(),
                                    CostKind);
  }

  FastMathFlags FMF;
  // TODO: Manage flags via VPRecipeWithIRFlags.
  if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
    FMF = FPMO->getFastMathFlags();

  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy =
      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideVectorRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // The cost of Freeze is unknown; assume it is the same as a 'mul'.
    Type *VectorTy =
        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                      CostKind, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPWidenEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  // TODO: Support other opcodes.
  if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
    llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");

  State.setDebugLocFrom(getDebugLoc());
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length.");

  assert(State.get(getOperand(0), 0)->getType()->isVectorTy() &&
         "VPWidenEVLRecipe should not be used for scalars");

  VPValue *EVL = getEVL();
  Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
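  // Illustrative note: an all-true mask is used here; the number of active
  // lanes is governed solely by EVL.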
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());

  SmallVector<Value *, 4> Ops;
  for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
    VPValue *VPOp = getOperand(I);
    Ops.push_back(State.get(VPOp, 0));
  }

  Builder.setMask(Mask).setEVL(EVLArg);
  Value *VPInst =
      Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
  // Currently vp-intrinsics only accept fast-math flags.
  // TODO: Enable other flags when support is added.
  if (isa<FPMathOperator>(VPInst))
    setFlags(cast<Instruction>(VPInst));

  State.set(this, VPInst, 0);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}

void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-VP ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode());
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant only for FP induction variables.
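/// For example (integer case, illustrative): with a 4-element Val of
/// <%v, %v, %v, %v>, StartIdx = 0 and Step = 2, the result is
/// <%v, %v + 2, %v + 4, %v + 6>.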
1400 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1401                             Instruction::BinaryOps BinOp, ElementCount VF,
1402                             IRBuilderBase &Builder) {
1403   assert(VF.isVector() && "only vector VFs are supported");
1404 
1405   // Create and check the types.
1406   auto *ValVTy = cast<VectorType>(Val->getType());
1407   ElementCount VLen = ValVTy->getElementCount();
1408 
1409   Type *STy = Val->getType()->getScalarType();
1410   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1411          "Induction Step must be an integer or FP");
1412   assert(Step->getType() == STy && "Step has wrong type");
1413 
1414   SmallVector<Constant *, 8> Indices;
1415 
1416   // Create a vector of consecutive numbers from zero to VF.
1417   VectorType *InitVecValVTy = ValVTy;
1418   if (STy->isFloatingPointTy()) {
1419     Type *InitVecValSTy =
1420         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1421     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1422   }
1423   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1424 
1425   // Splat the StartIdx.
1426   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1427 
1428   if (STy->isIntegerTy()) {
1429     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1430     Step = Builder.CreateVectorSplat(VLen, Step);
1431     assert(Step->getType() == Val->getType() && "Invalid step vec");
1432     // FIXME: The newly created binary instructions should contain nsw/nuw
1433     // flags, which can be found from the original scalar operations.
1434     Step = Builder.CreateMul(InitVec, Step);
1435     return Builder.CreateAdd(Val, Step, "induction");
1436   }
1437 
1438   // Floating point induction.
1439   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1440          "Binary Opcode should be specified for FP induction");
1441   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1442   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1443 
1444   Step = Builder.CreateVectorSplat(VLen, Step);
1445   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1446   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1447 }
1448 
1449 /// A helper function that returns a signed integer or floating-point constant
1450 /// with value C.
1451 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1452   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1453                            : ConstantFP::get(Ty, C);
1454 }
1455 
1456 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1457   assert(!State.Instance && "Int or FP induction being replicated.");
1458 
1459   Value *Start = getStartValue()->getLiveInIRValue();
1460   const InductionDescriptor &ID = getInductionDescriptor();
1461   TruncInst *Trunc = getTruncInst();
1462   IRBuilderBase &Builder = State.Builder;
1463   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1464   assert(State.VF.isVector() && "must have vector VF");
1465 
1466   // The value from the original loop to which we are mapping the new induction
1467   // variable.
1468   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1469 
1470   // Fast-math-flags propagate from the original induction instruction.
1471   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1472   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1473     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1474 
1475   // Now do the actual transformations, and start with fetching the step value.
1476   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1477 
1478   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1479          "Expected either an induction phi-node or a truncate of it!");
1480 
1481   // Construct the initial value of the vector IV in the vector loop preheader.
1482   auto CurrIP = Builder.saveIP();
1483   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1484   Builder.SetInsertPoint(VectorPH->getTerminator());
1485   if (isa<TruncInst>(EntryVal)) {
1486     assert(Start->getType()->isIntegerTy() &&
1487            "Truncation requires an integer type");
1488     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1489     Step = Builder.CreateTrunc(Step, TruncType);
1490     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1491   }
1492 
1493   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1494   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1495   Value *SteppedStart = getStepVector(
1496       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
1497 
1498   // We create vector phi nodes for both integer and floating-point induction
1499   // variables. Here, we determine the kind of arithmetic we will perform.
1500   Instruction::BinaryOps AddOp;
1501   Instruction::BinaryOps MulOp;
1502   if (Step->getType()->isIntegerTy()) {
1503     AddOp = Instruction::Add;
1504     MulOp = Instruction::Mul;
1505   } else {
1506     AddOp = ID.getInductionOpcode();
1507     MulOp = Instruction::FMul;
1508   }
1509 
1510   // Multiply the vectorization factor by the step using integer or
1511   // floating-point arithmetic as appropriate.
1512   Type *StepType = Step->getType();
1513   Value *RuntimeVF = State.get(getVFValue(), {0, 0});
1514   if (Step->getType()->isFloatingPointTy())
1515     RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1516   else
1517     RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1518   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1519 
1520   // Create a vector splat to use in the induction update.
1521   //
1522   // FIXME: If the step is non-constant, we create the vector splat with
1523   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1524   //        handle a constant vector splat.
1525   Value *SplatVF = isa<Constant>(Mul)
1526                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1527                        : Builder.CreateVectorSplat(State.VF, Mul);
1528   Builder.restoreIP(CurrIP);
1529 
1530   // We may need to add the step a number of times, depending on the unroll
1531   // factor. The last of those goes into the PHI.
1532   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1533   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1534   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1535   Instruction *LastInduction = VecInd;
1536   for (unsigned Part = 0; Part < State.UF; ++Part) {
1537     State.set(this, LastInduction, Part);
1538 
1539     if (isa<TruncInst>(EntryVal))
1540       State.addMetadata(LastInduction, EntryVal);
1541 
1542     LastInduction = cast<Instruction>(
1543         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
1544     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1545   }
1546 
1547   LastInduction->setName("vec.ind.next");
1548   VecInd->addIncoming(SteppedStart, VectorPH);
1549   // Add induction update using an incorrect block temporarily. The phi node
1550   // will be fixed after VPlan execution. Note that at this point the latch
1551   // block cannot be used, as it does not exist yet.
1552   // TODO: Model increment value in VPlan, by turning the recipe into a
1553   // multi-def and a subclass of VPHeaderPHIRecipe.
1554   VecInd->addIncoming(LastInduction, VectorPH);
1555 }
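
// Illustrative sketch (names assumed): for an i64 IV with start 0, step 1,
// VF = 4 and UF = 1, the code above produces roughly
//   vector.body:
//     %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ],
//                              [ %vec.ind.next, ... ]
//     %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
// with the backedge value fixed up after VPlan execution, as noted above.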
1556 
1557 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1558 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1559                                           VPSlotTracker &SlotTracker) const {
1560   O << Indent << "WIDEN-INDUCTION";
1561   if (getTruncInst()) {
1562     O << "\\l\"";
1563     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1564     O << " +\n" << Indent << "\"  ";
1565     getVPValue(0)->printAsOperand(O, SlotTracker);
1566   } else
1567     O << " " << VPlanIngredient(IV);
1568 
1569   O << ", ";
1570   getStepValue()->printAsOperand(O, SlotTracker);
1571 
1572   O << ", ";
1573   getVFValue()->printAsOperand(O, SlotTracker);
1574 }
1575 #endif
1576 
1577 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1578   // The step may be defined by a recipe in the preheader (e.g. if it requires
1579   // SCEV expansion), but for the canonical induction the step is required to be
1580   // 1, which is represented as a live-in.
1581   if (getStepValue()->getDefiningRecipe())
1582     return false;
1583   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1584   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1585   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1586   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1587          getScalarType() == CanIV->getScalarType();
1588 }
1589 
1590 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1591 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1592                               VPSlotTracker &SlotTracker) const {
1593   O << Indent;
1594   printAsOperand(O, SlotTracker);
1595   O << " = DERIVED-IV ";
1596   getStartValue()->printAsOperand(O, SlotTracker);
1597   O << " + ";
1598   getOperand(1)->printAsOperand(O, SlotTracker);
1599   O << " * ";
1600   getStepValue()->printAsOperand(O, SlotTracker);
1601 }
1602 #endif
1603 
1604 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1605   // Fast-math-flags propagate from the original induction instruction.
1606   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1607   if (hasFastMathFlags())
1608     State.Builder.setFastMathFlags(getFastMathFlags());
1609 
1610   // Compute scalar induction steps. BaseIV is the scalar induction variable
1611   // on which to base the steps; Step is the size of the step.
1612 
1613   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1614   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1615   IRBuilderBase &Builder = State.Builder;
1616 
1617   // Ensure the step has the same type as the scalar IV.
1618   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1619   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1620 
1621   // We build scalar steps for both integer and floating-point induction
1622   // variables. Here, we determine the kind of arithmetic we will perform.
1623   Instruction::BinaryOps AddOp;
1624   Instruction::BinaryOps MulOp;
1625   if (BaseIVTy->isIntegerTy()) {
1626     AddOp = Instruction::Add;
1627     MulOp = Instruction::Mul;
1628   } else {
1629     AddOp = InductionOpcode;
1630     MulOp = Instruction::FMul;
1631   }
1632 
1633   // Determine the number of scalars we need to generate for each unroll
1634   // iteration.
1635   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1636   // Compute the scalar steps and save the results in State.
1637   Type *IntStepTy =
1638       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1639   Type *VecIVTy = nullptr;
1640   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1641   if (!FirstLaneOnly && State.VF.isScalable()) {
1642     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1643     UnitStepVec =
1644         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1645     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1646     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1647   }
1648 
1649   unsigned StartPart = 0;
1650   unsigned EndPart = State.UF;
1651   unsigned StartLane = 0;
1652   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1653   if (State.Instance) {
1654     StartPart = State.Instance->Part;
1655     EndPart = StartPart + 1;
1656     StartLane = State.Instance->Lane.getKnownLane();
1657     EndLane = StartLane + 1;
1658   }
1659   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1660     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1661 
1662     if (!FirstLaneOnly && State.VF.isScalable()) {
1663       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1664       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1665       if (BaseIVTy->isFloatingPointTy())
1666         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1667       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1668       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1669       State.set(this, Add, Part);
1670       // It's also useful to record the per-lane values for the known minimum
1671       // number of elements, so we do that below as well. This improves code
1672       // quality when e.g. extracting the first element.
1673     }
1674 
1675     if (BaseIVTy->isFloatingPointTy())
1676       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1677 
1678     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1679       Value *StartIdx = Builder.CreateBinOp(
1680           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1681       // The step returned by `createStepForVF` is a runtime-evaluated value
1682       // when VF is scalable. Otherwise, it should be folded into a Constant.
1683       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1684              "Expected StartIdx to be folded to a constant when VF is not "
1685              "scalable");
1686       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1687       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1688       State.set(this, Add, VPIteration(Part, Lane));
1689     }
1690   }
1691 }
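
// Illustrative sketch: for an i64 IV with step 2, VF = 4, UF = 1 and all
// lanes used, the per-lane values set above are BaseIV + Lane * 2, i.e.
//   %s0 = add i64 %base.iv, 0
//   %s1 = add i64 %base.iv, 2
//   %s2 = add i64 %base.iv, 4
//   %s3 = add i64 %base.iv, 6
// (names are assumptions; IRBuilder constant-folds where possible).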
1692 
1693 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1694 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1695                                   VPSlotTracker &SlotTracker) const {
1696   O << Indent;
1697   printAsOperand(O, SlotTracker);
1698   O << " = SCALAR-STEPS ";
1699   printOperands(O, SlotTracker);
1700 }
1701 #endif
1702 
1703 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1704   assert(State.VF.isVector() && "not widening");
1705   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1706   // Construct a vector GEP by widening the operands of the scalar GEP as
1707   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1708   // results in a vector of pointers when at least one operand of the GEP
1709   // is vector-typed. Thus, to keep the representation compact, we only use
1710   // vector-typed operands for loop-varying values.
1711 
1712   if (areAllOperandsInvariant()) {
1713     // If we are vectorizing, but the GEP has only loop-invariant operands,
1714     // the GEP we build (by only using vector-typed operands for
1715     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1716     // produce a vector of pointers, we need to either arbitrarily pick an
1717     // operand to broadcast, or broadcast a clone of the original GEP.
1718     // Here, we broadcast a clone of the original.
1719     //
1720     // TODO: If at some point we decide to scalarize instructions having
1721     //       loop-invariant operands, this special case will no longer be
1722     //       required. We would add the scalarization decision to
1723     //       collectLoopScalars() and teach getVectorValue() to broadcast
1724     //       the lane-zero scalar value.
1725     SmallVector<Value *> Ops;
1726     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1727       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1728 
1729     auto *NewGEP =
1730         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1731                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1732     for (unsigned Part = 0; Part < State.UF; ++Part) {
1733       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1734       State.set(this, EntryPart, Part);
1735       State.addMetadata(EntryPart, GEP);
1736     }
1737   } else {
1738     // If the GEP has at least one loop-varying operand, we are sure to
1739     // produce a vector of pointers. But if we are only unrolling, we want
1740     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1741     // produce with the code below will be scalar (if VF == 1) or vector
1742     // (otherwise). Note that for the unroll-only case, we still maintain
1743     // values in the vector mapping with initVector, as we do for other
1744     // instructions.
1745     for (unsigned Part = 0; Part < State.UF; ++Part) {
1746       // The pointer operand of the new GEP. If it's loop-invariant, we
1747       // won't broadcast it.
1748       auto *Ptr = isPointerLoopInvariant()
1749                       ? State.get(getOperand(0), VPIteration(0, 0))
1750                       : State.get(getOperand(0), Part);
1751 
1752       // Collect all the indices for the new GEP. If any index is
1753       // loop-invariant, we won't broadcast it.
1754       SmallVector<Value *, 4> Indices;
1755       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1756         VPValue *Operand = getOperand(I);
1757         if (isIndexLoopInvariant(I - 1))
1758           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1759         else
1760           Indices.push_back(State.get(Operand, Part));
1761       }
1762 
1763       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1764       // but it should be a vector otherwise.
1765       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1766                                              Indices, "", isInBounds());
1767       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1768              "NewGEP is not a pointer vector");
1769       State.set(this, NewGEP, Part);
1770       State.addMetadata(NewGEP, GEP);
1771     }
1772   }
1773 }
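
// Illustrative sketch: with a loop-invariant base pointer and a widened
// index, the loop-varying branch above yields a vector-of-pointers GEP like
//   %gep = getelementptr inbounds i32, ptr %base, <4 x i64> %vec.ind
// while the all-invariant branch splats one scalar GEP across the lanes.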
1774 
1775 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1776 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1777                              VPSlotTracker &SlotTracker) const {
1778   O << Indent << "WIDEN-GEP ";
1779   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1780   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1781     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1782 
1783   O << " ";
1784   printAsOperand(O, SlotTracker);
1785   O << " = getelementptr";
1786   printFlags(O);
1787   printOperands(O, SlotTracker);
1788 }
1789 #endif
1790 
1791 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1792   auto &Builder = State.Builder;
1793   State.setDebugLocFrom(getDebugLoc());
1794   for (unsigned Part = 0; Part < State.UF; ++Part) {
1795     // Calculate the pointer for the specific unroll-part.
1796     Value *PartPtr = nullptr;
1797     // Use i32 for the gep index type when the value is constant,
1798     // or query DataLayout for a more suitable index type otherwise.
1799     const DataLayout &DL =
1800         Builder.GetInsertBlock()->getDataLayout();
1801     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1802                         ? DL.getIndexType(IndexedTy->getPointerTo())
1803                         : Builder.getInt32Ty();
1804     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1805     bool InBounds = isInBounds();
1806     if (IsReverse) {
1807       // If the address is consecutive but reversed, then the wide load or
1808       // store needs to start at the last vector element.
1809       // RunTimeVF = VScale * VF.getKnownMinValue()
1810       // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
1811       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1812       // NumElt = -Part * RunTimeVF
1813       Value *NumElt = Builder.CreateMul(
1814           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1815       // LastLane = 1 - RunTimeVF
1816       Value *LastLane =
1817           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1818       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1819       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1820     } else {
1821       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1822       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1823     }
1824 
1825     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1826   }
1827 }
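
// Illustrative sketch of the reverse case for fixed-width VF = 4 and
// Part = 0: RunTimeVF = 4, NumElt = 0 and LastLane = -3, so the GEPs are
//   %p0 = getelementptr i32, ptr %ptr, i32 0
//   %p1 = getelementptr i32, ptr %p0, i32 -3
// moving the pointer to the lowest address of the reversed group
// (IRBuilder may constant-fold the first GEP away).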
1828 
1829 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1830 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1831                                   VPSlotTracker &SlotTracker) const {
1832   O << Indent;
1833   printAsOperand(O, SlotTracker);
1834   O << " = vector-pointer ";
1835   if (IsReverse)
1836     O << "(reverse) ";
1837 
1838   printOperands(O, SlotTracker);
1839 }
1840 #endif
1841 
1842 void VPBlendRecipe::execute(VPTransformState &State) {
1843   assert(isNormalized() && "Expected blend to be normalized!");
1844   State.setDebugLocFrom(getDebugLoc());
1845   // We know that all PHIs in non-header blocks are converted into
1846   // selects, so we don't have to worry about the insertion order and we
1847   // can just use the builder.
1848   // At this point we generate the predication tree. There may be
1849   // duplications since this is a simple recursive scan, but future
1850   // optimizations will clean it up.
1851 
1852   unsigned NumIncoming = getNumIncomingValues();
1853 
1854   // Generate a sequence of selects of the form:
1855   // SELECT(Mask3, In3,
1856   //        SELECT(Mask2, In2,
1857   //               SELECT(Mask1, In1,
1858   //                      In0)))
1859   // Note that Mask0 is never used: lanes for which no path reaches this phi
1860   // are essentially undef and are taken from In0.
1861   VectorParts Entry(State.UF);
1862   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1863   for (unsigned In = 0; In < NumIncoming; ++In) {
1864     for (unsigned Part = 0; Part < State.UF; ++Part) {
1865       // We might have single-edge PHIs (blocks); use an identity
1866       // 'select' for the first PHI operand.
1867       Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
1868       if (In == 0)
1869         Entry[Part] = In0; // Initialize with the first incoming value.
1870       else {
1871         // Select between the current value and the previous incoming edge
1872         // based on the incoming mask.
1873         Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
1874         Entry[Part] =
1875             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1876       }
1877     }
1878   }
1879 
1880   for (unsigned Part = 0; Part < State.UF; ++Part)
1881     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
1882 }
1883 
1884 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1885 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1886                           VPSlotTracker &SlotTracker) const {
1887   O << Indent << "BLEND ";
1888   printAsOperand(O, SlotTracker);
1889   O << " =";
1890   if (getNumIncomingValues() == 1) {
1891     // Not a User of any mask: not really blending, this is a
1892     // single-predecessor phi.
1893     O << " ";
1894     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1895   } else {
1896     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1897       O << " ";
1898       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1899       if (I == 0)
1900         continue;
1901       O << "/";
1902       getMask(I)->printAsOperand(O, SlotTracker);
1903     }
1904   }
1905 }
1906 #endif
1907 
1908 void VPReductionRecipe::execute(VPTransformState &State) {
1909   assert(!State.Instance && "Reduction being replicated.");
1910   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1911   RecurKind Kind = RdxDesc.getRecurrenceKind();
1912   // Propagate the fast-math flags carried by the underlying instruction.
1913   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1914   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1915   for (unsigned Part = 0; Part < State.UF; ++Part) {
1916     Value *NewVecOp = State.get(getVecOp(), Part);
1917     if (VPValue *Cond = getCondOp()) {
1918       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1919       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1920       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1921 
1922       Value *Start;
1923       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
1924         Start = RdxDesc.getRecurrenceStartValue();
1925       else
1926         Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
1927                                             RdxDesc.getFastMathFlags());
1928       if (State.VF.isVector())
1929         Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(),
1930                                                 Start);
1931 
1932       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
1933       NewVecOp = Select;
1934     }
1935     Value *NewRed;
1936     Value *NextInChain;
1937     if (IsOrdered) {
1938       if (State.VF.isVector())
1939         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1940                                         PrevInChain);
1941       else
1942         NewRed = State.Builder.CreateBinOp(
1943             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1944             NewVecOp);
1945       PrevInChain = NewRed;
1946       NextInChain = NewRed;
1947     } else {
1948       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1949       NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
1950       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1951         NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1952                                      NewRed, PrevInChain);
1953       else
1954         NextInChain = State.Builder.CreateBinOp(
1955             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed,
1956             PrevInChain);
1957     }
1958     State.set(this, NextInChain, Part, /*IsScalar*/ true);
1959   }
1960 }
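
// Illustrative sketch: for an unordered integer add reduction, each part
// above produces something like
//   %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec.op)
//   %next = add i32 %red, %prev.chain
// whereas the ordered FP path emits llvm.vector.reduce.fadd with the chain
// value as its scalar start operand.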
1961 
1962 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1963   assert(!State.Instance && "Reduction being replicated.");
1964   assert(State.UF == 1 &&
1965          "Expected only UF == 1 when vectorizing with explicit vector length.");
1966 
1967   auto &Builder = State.Builder;
1968   // Propagate the fast-math flags carried by the underlying instruction.
1969   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1970   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1971   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1972 
1973   RecurKind Kind = RdxDesc.getRecurrenceKind();
1974   Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
1975   Value *VecOp = State.get(getVecOp(), 0);
1976   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
1977 
1978   VectorBuilder VBuilder(Builder);
1979   VBuilder.setEVL(EVL);
1980   Value *Mask;
1981   // TODO: move the all-true mask generation into VectorBuilder.
1982   if (VPValue *CondOp = getCondOp())
1983     Mask = State.get(CondOp, 0);
1984   else
1985     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
1986   VBuilder.setMask(Mask);
1987 
1988   Value *NewRed;
1989   if (isOrdered()) {
1990     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
1991   } else {
1992     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
1993     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1994       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
1995     else
1996       NewRed = Builder.CreateBinOp(
1997           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
1998   }
1999   State.set(this, NewRed, 0, /*IsScalar*/ true);
2000 }
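
// Illustrative sketch: for an unordered add reduction, the EVL path above
// lowers to a predicated reduction intrinsic such as
//   %r = call i32 @llvm.vp.reduce.add.nxv4i32(
//       i32 0, <vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, i32 %evl)
// whose result is then combined with the chain value.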
2001 
2002 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2003 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2004                               VPSlotTracker &SlotTracker) const {
2005   O << Indent << "REDUCE ";
2006   printAsOperand(O, SlotTracker);
2007   O << " = ";
2008   getChainOp()->printAsOperand(O, SlotTracker);
2009   O << " +";
2010   if (isa<FPMathOperator>(getUnderlyingInstr()))
2011     O << getUnderlyingInstr()->getFastMathFlags();
2012   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2013   getVecOp()->printAsOperand(O, SlotTracker);
2014   if (isConditional()) {
2015     O << ", ";
2016     getCondOp()->printAsOperand(O, SlotTracker);
2017   }
2018   O << ")";
2019   if (RdxDesc.IntermediateStore)
2020     O << " (with final reduction value stored in invariant address sank "
2021          "outside of loop)";
2022 }
2023 
2024 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2025                                  VPSlotTracker &SlotTracker) const {
2026   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2027   O << Indent << "REDUCE ";
2028   printAsOperand(O, SlotTracker);
2029   O << " = ";
2030   getChainOp()->printAsOperand(O, SlotTracker);
2031   O << " +";
2032   if (isa<FPMathOperator>(getUnderlyingInstr()))
2033     O << getUnderlyingInstr()->getFastMathFlags();
2034   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2035   getVecOp()->printAsOperand(O, SlotTracker);
2036   O << ", ";
2037   getEVL()->printAsOperand(O, SlotTracker);
2038   if (isConditional()) {
2039     O << ", ";
2040     getCondOp()->printAsOperand(O, SlotTracker);
2041   }
2042   O << ")";
2043   if (RdxDesc.IntermediateStore)
2044     O << " (with final reduction value stored in invariant address sank "
2045          "outside of loop)";
2046 }
2047 #endif
2048 
2049 bool VPReplicateRecipe::shouldPack() const {
2050   // Check if the recipe is used by a widened recipe via an intervening
2051   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2052   return any_of(users(), [](const VPUser *U) {
2053     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2054       return any_of(PredR->users(), [PredR](const VPUser *U) {
2055         return !U->usesScalars(PredR);
2056       });
2057     return false;
2058   });
2059 }
2060 
2061 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2062 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2063                               VPSlotTracker &SlotTracker) const {
2064   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2065 
2066   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2067     printAsOperand(O, SlotTracker);
2068     O << " = ";
2069   }
2070   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2071     O << "call";
2072     printFlags(O);
2073     O << "@" << CB->getCalledFunction()->getName() << "(";
2074     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2075                     O, [&O, &SlotTracker](VPValue *Op) {
2076                       Op->printAsOperand(O, SlotTracker);
2077                     });
2078     O << ")";
2079   } else {
2080     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2081     printFlags(O);
2082     printOperands(O, SlotTracker);
2083   }
2084 
2085   if (shouldPack())
2086     O << " (S->V)";
2087 }
2088 #endif
2089 
2090 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
2091 /// if it is either defined outside the vector region or its operand is known to
2092 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
2093 /// TODO: Uniformity should be associated with a VPValue and there should be a
2094 /// generic way to check.
2095 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
2096   return C->isDefinedOutsideVectorRegions() ||
2097          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
2098          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
2099 }
2100 
2101 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
2102   assert(vputils::onlyFirstLaneUsed(this) &&
2103          "Codegen only implemented for first lane.");
2104   switch (Opcode) {
2105   case Instruction::SExt:
2106   case Instruction::ZExt:
2107   case Instruction::Trunc: {
2108     // Note: SExt/ZExt not used yet.
2109     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
2110     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2111   }
2112   default:
2113     llvm_unreachable("opcode not implemented yet");
2114   }
2115 }
2116 
2117 void VPScalarCastRecipe::execute(VPTransformState &State) {
2118   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
2119   for (unsigned Part = 0; Part != State.UF; ++Part) {
2120     Value *Res;
2121     // Only generate a single instance if the recipe is uniform across UFs
2122     // and VFs.
2123     if (Part > 0 && IsUniformAcrossVFsAndUFs)
2124       Res = State.get(this, VPIteration(0, 0));
2125     else
2126       Res = generate(State, Part);
2127     State.set(this, Res, VPIteration(Part, 0));
2128   }
2129 }
2130 
2131 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2132 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2133                                 VPSlotTracker &SlotTracker) const {
2134   O << Indent << "SCALAR-CAST ";
2135   printAsOperand(O, SlotTracker);
2136   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2137   printOperands(O, SlotTracker);
2138   O << " to " << *ResultTy;
2139 }
2140 #endif
2141 
2142 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2143   assert(State.Instance && "Branch on Mask works only on single instance.");
2144 
2145   unsigned Part = State.Instance->Part;
2146   unsigned Lane = State.Instance->Lane.getKnownLane();
2147 
2148   Value *ConditionBit = nullptr;
2149   VPValue *BlockInMask = getMask();
2150   if (BlockInMask) {
2151     ConditionBit = State.get(BlockInMask, Part);
2152     if (ConditionBit->getType()->isVectorTy())
2153       ConditionBit = State.Builder.CreateExtractElement(
2154           ConditionBit, State.Builder.getInt32(Lane));
2155   } else // Block in mask is all-one.
2156     ConditionBit = State.Builder.getTrue();
2157 
2158   // Replace the temporary unreachable terminator with a new conditional branch,
2159   // whose two destinations will be set later when they are created.
2160   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2161   assert(isa<UnreachableInst>(CurrentTerminator) &&
2162          "Expected to replace unreachable terminator with conditional branch.");
2163   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2164   CondBr->setSuccessor(0, nullptr);
2165   ReplaceInstWithInst(CurrentTerminator, CondBr);
2166 }
2167 
2168 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2169   assert(State.Instance && "Predicated instruction PHI works per instance.");
2170   Instruction *ScalarPredInst =
2171       cast<Instruction>(State.get(getOperand(0), *State.Instance));
2172   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2173   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2174   assert(PredicatingBB && "Predicated block has no single predecessor.");
2175   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2176          "operand must be VPReplicateRecipe");
2177 
2178   // By current pack/unpack logic we need to generate only a single phi node: if
2179   // a vector value for the predicated instruction exists at this point it means
2180   // the instruction has vector users only, and a phi for the vector value is
2181   // needed. In this case the recipe of the predicated instruction is marked to
2182   // also do that packing, thereby "hoisting" the insert-element sequence.
2183   // Otherwise, a phi node for the scalar value is needed.
2184   unsigned Part = State.Instance->Part;
2185   if (State.hasVectorValue(getOperand(0), Part)) {
2186     Value *VectorValue = State.get(getOperand(0), Part);
2187     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2188     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2189     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2190     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2191     if (State.hasVectorValue(this, Part))
2192       State.reset(this, VPhi, Part);
2193     else
2194       State.set(this, VPhi, Part);
2195     // NOTE: Currently we need to update the value of the operand, so the next
2196     // predicated iteration inserts its generated value in the correct vector.
2197     State.reset(getOperand(0), VPhi, Part);
2198   } else {
2199     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2200     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2201     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2202                      PredicatingBB);
2203     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2204     if (State.hasScalarValue(this, *State.Instance))
2205       State.reset(this, Phi, *State.Instance);
2206     else
2207       State.set(this, Phi, *State.Instance);
2208     // NOTE: Currently we need to update the value of the operand, so the next
2209     // predicated iteration inserts its generated value in the correct vector.
2210     State.reset(getOperand(0), Phi, *State.Instance);
2211   }
2212 }
2213 
2214 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2215 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2216                                 VPSlotTracker &SlotTracker) const {
2217   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2218   printAsOperand(O, SlotTracker);
2219   O << " = ";
2220   printOperands(O, SlotTracker);
2221 }
2222 #endif
2223 
2224 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2225                                                  VPCostContext &Ctx) const {
2226   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2227   const Align Alignment =
2228       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2229   unsigned AS =
2230       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2231   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2232 
2233   if (!Consecutive) {
2234     // TODO: Using the original IR may not be accurate.
2235     // Currently, ARM will use the underlying IR to calculate gather/scatter
2236     // instruction cost.
2237     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2238     assert(!Reverse &&
2239            "Inconsecutive memory access should not have the order.");
2240     return Ctx.TTI.getAddressComputationCost(Ty) +
2241            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2242                                           IsMasked, Alignment, CostKind,
2243                                           &Ingredient);
2244   }
2245 
2246   InstructionCost Cost = 0;
2247   if (IsMasked) {
2248     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2249                                           AS, CostKind);
2250   } else {
2251     TTI::OperandValueInfo OpInfo =
2252         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2253     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2254                                     CostKind, OpInfo, &Ingredient);
2255   }
2256   if (!Reverse)
2257     return Cost;
2258 
2259   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2260                                         cast<VectorType>(Ty), std::nullopt,
2261                                         CostKind, 0);
2262 }
2263 
2264 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2265   auto *LI = cast<LoadInst>(&Ingredient);
2266 
2267   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2268   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2269   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2270   bool CreateGather = !isConsecutive();
2271 
2272   auto &Builder = State.Builder;
2273   State.setDebugLocFrom(getDebugLoc());
2274   for (unsigned Part = 0; Part < State.UF; ++Part) {
2275     Value *NewLI;
2276     Value *Mask = nullptr;
2277     if (auto *VPMask = getMask()) {
2278       // Mask reversal is only needed for non-null masks: a null mask denotes
2279       // all-ones, and the reverse of an all-ones mask is all-ones.
2280       Mask = State.get(VPMask, Part);
2281       if (isReverse())
2282         Mask = Builder.CreateVectorReverse(Mask, "reverse");
2283     }
2284 
2285     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
2286     if (CreateGather) {
2287       NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2288                                          "wide.masked.gather");
2289     } else if (Mask) {
2290       NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2291                                        PoisonValue::get(DataTy),
2292                                        "wide.masked.load");
2293     } else {
2294       NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2295     }
2296     // Add metadata to the load; the value set in State is the reverse shuffle.
2297     State.addMetadata(NewLI, LI);
2298     if (Reverse)
2299       NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2300     State.set(this, NewLI, Part);
2301   }
2302 }
2303 
2304 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2305 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2306                               VPSlotTracker &SlotTracker) const {
2307   O << Indent << "WIDEN ";
2308   printAsOperand(O, SlotTracker);
2309   O << " = load ";
2310   printOperands(O, SlotTracker);
2311 }
2312 #endif
2313 
2314 /// Use an all-true mask for the reverse rather than the actual mask, as it
2315 /// avoids a dependence without affecting the result.
2316 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2317                                      Value *EVL, const Twine &Name) {
2318   VectorType *ValTy = cast<VectorType>(Operand->getType());
2319   Value *AllTrueMask =
2320       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2321   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2322                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2323 }
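
// Illustrative sketch: createReverseEVL emits a call such as
//   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
//       <vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true), i32 %evl)
// which reverses only the first EVL elements of %v (types assumed).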
2324 
2325 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2326   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
2327                           "explicit vector length.");
2328   auto *LI = cast<LoadInst>(&Ingredient);
2329 
2330   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2331   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2332   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2333   bool CreateGather = !isConsecutive();
2334 
2335   auto &Builder = State.Builder;
2336   State.setDebugLocFrom(getDebugLoc());
2337   CallInst *NewLI;
2338   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2339   Value *Addr = State.get(getAddr(), 0, !CreateGather);
2340   Value *Mask = nullptr;
2341   if (VPValue *VPMask = getMask()) {
2342     Mask = State.get(VPMask, 0);
2343     if (isReverse())
2344       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2345   } else {
2346     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2347   }
2348 
2349   if (CreateGather) {
2350     NewLI =
2351         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2352                                 nullptr, "wide.masked.gather");
2353   } else {
2354     VectorBuilder VBuilder(Builder);
2355     VBuilder.setEVL(EVL).setMask(Mask);
2356     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2357         Instruction::Load, DataTy, Addr, "vp.op.load"));
2358   }
2359   NewLI->addParamAttr(
2360       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2361   State.addMetadata(NewLI, LI);
2362   Instruction *Res = NewLI;
2363   if (isReverse())
2364     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2365   State.set(this, Res, 0);
2366 }
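
// Illustrative sketch: the consecutive (non-gather) path above produces
//   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//       ptr align 4 %addr, <vscale x 4 x i1> %mask, i32 %evl)
// with the alignment attached afterwards as a parameter attribute.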
2367 
2368 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2369 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2370                                  VPSlotTracker &SlotTracker) const {
2371   O << Indent << "WIDEN ";
2372   printAsOperand(O, SlotTracker);
2373   O << " = vp.load ";
2374   printOperands(O, SlotTracker);
2375 }
2376 #endif
2377 
2378 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2379   auto *SI = cast<StoreInst>(&Ingredient);
2380 
2381   VPValue *StoredVPValue = getStoredValue();
2382   bool CreateScatter = !isConsecutive();
2383   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2384 
2385   auto &Builder = State.Builder;
2386   State.setDebugLocFrom(getDebugLoc());
2387 
2388   for (unsigned Part = 0; Part < State.UF; ++Part) {
2389     Instruction *NewSI = nullptr;
2390     Value *Mask = nullptr;
2391     if (auto *VPMask = getMask()) {
2392       // Mask reversal is only needed for non-null masks: a null mask denotes
2393       // all-ones, and the reverse of an all-ones mask is all-ones.
2394       Mask = State.get(VPMask, Part);
2395       if (isReverse())
2396         Mask = Builder.CreateVectorReverse(Mask, "reverse");
2397     }
2398 
2399     Value *StoredVal = State.get(StoredVPValue, Part);
2400     if (isReverse()) {
2401       // If we store to reverse consecutive memory locations, then we need
2402       // to reverse the order of elements in the stored value.
2403       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2404       // We don't want to update the value in the map as it might be used in
2405       // another expression. So don't call resetVectorValue(StoredVal).
2406     }
2407     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
2408     if (CreateScatter)
2409       NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2410     else if (Mask)
2411       NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2412     else
2413       NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2414     State.addMetadata(NewSI, SI);
2415   }
2416 }
2417 
2418 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2419 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2420                                VPSlotTracker &SlotTracker) const {
2421   O << Indent << "WIDEN store ";
2422   printOperands(O, SlotTracker);
2423 }
2424 #endif
2425 
2426 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2427   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
2428                           "explicit vector length.");
2429   auto *SI = cast<StoreInst>(&Ingredient);
2430 
2431   VPValue *StoredValue = getStoredValue();
2432   bool CreateScatter = !isConsecutive();
2433   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2434 
2435   auto &Builder = State.Builder;
2436   State.setDebugLocFrom(getDebugLoc());
2437 
2438   CallInst *NewSI = nullptr;
2439   Value *StoredVal = State.get(StoredValue, 0);
2440   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
2441   if (isReverse())
2442     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2443   Value *Mask = nullptr;
2444   if (VPValue *VPMask = getMask()) {
2445     Mask = State.get(VPMask, 0);
2446     if (isReverse())
2447       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2448   } else {
2449     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2450   }
2451   Value *Addr = State.get(getAddr(), 0, !CreateScatter);
2452   if (CreateScatter) {
2453     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2454                                     Intrinsic::vp_scatter,
2455                                     {StoredVal, Addr, Mask, EVL});
2456   } else {
2457     VectorBuilder VBuilder(Builder);
2458     VBuilder.setEVL(EVL).setMask(Mask);
2459     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2460         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2461         {StoredVal, Addr}));
2462   }
2463   NewSI->addParamAttr(
2464       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2465   State.addMetadata(NewSI, SI);
2466 }
2467 
2468 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2469 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2470                                   VPSlotTracker &SlotTracker) const {
2471   O << Indent << "WIDEN vp.store ";
2472   printOperands(O, SlotTracker);
2473 }
2474 #endif
2475 
2476 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2477                                      VectorType *DstVTy, const DataLayout &DL) {
2478   // Verify that V is a vector type with the same number of elements as DstVTy.
2479   auto VF = DstVTy->getElementCount();
2480   auto *SrcVecTy = cast<VectorType>(V->getType());
2481   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2482   Type *SrcElemTy = SrcVecTy->getElementType();
2483   Type *DstElemTy = DstVTy->getElementType();
2484   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2485          "Vector elements must have same size");
2486 
2487   // Do a direct cast if element types are castable.
2488   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2489     return Builder.CreateBitOrPointerCast(V, DstVTy);
2490   }
2491   // V cannot be directly cast to the desired vector type.
2492   // This may happen when V is a floating-point vector but DstVTy is a vector
2493   // of pointers, or vice-versa. Handle this with a two-step bitcast through an
2494   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
2495   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2496          "Only one type should be a pointer type");
2497   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2498          "Only one type should be a floating point type");
2499   Type *IntTy =
2500       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2501   auto *VecIntTy = VectorType::get(IntTy, VF);
2502   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2503   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2504 }
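
// Illustrative sketch: casting <4 x double> to <4 x ptr> is not a single
// bitcast, so the two-step path above would generate
//   %i = bitcast <4 x double> %v to <4 x i64>
//   %p = inttoptr <4 x i64> %i to <4 x ptr>
// (assuming 64-bit element and pointer sizes).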
2505 
2506 /// Return a vector containing interleaved elements from multiple
2507 /// smaller input vectors.
2508 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2509                                 const Twine &Name) {
2510   unsigned Factor = Vals.size();
2511   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2512 
2513   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2514 #ifndef NDEBUG
2515   for (Value *Val : Vals)
2516     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2517 #endif
2518 
2519   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so we
2520   // must use intrinsics to interleave.
2521   if (VecTy->isScalableTy()) {
2522     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2523     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2524                                    Vals,
2525                                    /*FMFSource=*/nullptr, Name);
2526   }
2527 
2528   // Fixed length. Start by concatenating all vectors into a wide vector.
2529   Value *WideVec = concatenateVectors(Builder, Vals);
2530 
2531   // Interleave the elements into the wide vector.
2532   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2533   return Builder.CreateShuffleVector(
2534       WideVec, createInterleaveMask(NumElts, Factor), Name);
2535 }
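
// Illustrative sketch: interleaving two fixed <4 x i32> vectors concatenates
// them into an <8 x i32> wide vector and shuffles it with the mask
// <0, 4, 1, 5, 2, 6, 3, 7>; scalable vectors get the same effect from the
// llvm.vector.interleave2 intrinsic instead.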
2536 
2537 // Try to vectorize the interleave group that \p Instr belongs to.
2538 //
2539 // E.g. Translate following interleaved load group (factor = 3):
2540 //   for (i = 0; i < N; i+=3) {
2541 //     R = Pic[i];             // Member of index 0
2542 //     G = Pic[i+1];           // Member of index 1
2543 //     B = Pic[i+2];           // Member of index 2
2544 //     ... // do something to R, G, B
2545 //   }
2546 // To:
2547 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2548 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2549 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2550 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2551 //
2552 // Or translate following interleaved store group (factor = 3):
2553 //   for (i = 0; i < N; i+=3) {
2554 //     ... do something to R, G, B
2555 //     Pic[i]   = R;           // Member of index 0
2556 //     Pic[i+1] = G;           // Member of index 1
2557 //     Pic[i+2] = B;           // Member of index 2
2558 //   }
2559 // To:
2560 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2561 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2562 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2563 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2564 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2565 void VPInterleaveRecipe::execute(VPTransformState &State) {
2566   assert(!State.Instance && "Interleave group being replicated.");
2567   const InterleaveGroup<Instruction> *Group = IG;
2568   Instruction *Instr = Group->getInsertPos();
2569 
2570   // Prepare the vector type of the interleaved load/store.
2571   Type *ScalarTy = getLoadStoreType(Instr);
2572   unsigned InterleaveFactor = Group->getFactor();
2573   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2574 
2575   // Prepare the new pointers.
2576   SmallVector<Value *, 2> AddrParts;
2577   unsigned Index = Group->getIndex(Instr);
2578 
2579   // TODO: extend the masked interleaved-group support to reversed access.
2580   VPValue *BlockInMask = getMask();
2581   assert((!BlockInMask || !Group->isReverse()) &&
2582          "Reversed masked interleave-group not supported.");
2583 
2584   Value *Idx;
2585   // If the group is reverse, adjust the index to refer to the last vector lane
2586   // instead of the first. We adjust the index from the first vector lane,
2587   // rather than directly getting the pointer for lane VF - 1, because the
2588   // pointer operand of the interleaved access is supposed to be uniform. For
2589   // uniform instructions, we're only required to generate a value for the
2590   // first vector lane in each unroll iteration.
2591   if (Group->isReverse()) {
2592     Value *RuntimeVF =
2593         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2594     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2595     Idx = State.Builder.CreateMul(Idx,
2596                                   State.Builder.getInt32(Group->getFactor()));
2597     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2598     Idx = State.Builder.CreateNeg(Idx);
2599   } else
2600     Idx = State.Builder.getInt32(-Index);
2601 
2602   VPValue *Addr = getAddr();
2603   for (unsigned Part = 0; Part < State.UF; Part++) {
2604     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2605     if (auto *I = dyn_cast<Instruction>(AddrPart))
2606       State.setDebugLocFrom(I->getDebugLoc());
2607 
2608     // Note that the current instruction could be at any member index. We need
2609     // to adjust the address to that of the member at index 0.
2610     //
2611     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2612     //       b = A[i];       // Member of index 0
2613     // The current pointer points to A[i+1]; adjust it to A[i].
2614     //
2615     // E.g.  A[i+1] = a;     // Member of index 1
2616     //       A[i]   = b;     // Member of index 0
2617     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2618     // The current pointer points to A[i+2]; adjust it to A[i].
2619 
2620     bool InBounds = false;
2621     if (auto *GEP = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2622       InBounds = GEP->isInBounds();
2623     AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2624     AddrParts.push_back(AddrPart);
2625   }
2626 
2627   State.setDebugLocFrom(Instr->getDebugLoc());
2628   Value *PoisonVec = PoisonValue::get(VecTy);
2629 
2630   auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
2631                              unsigned Part, Value *MaskForGaps) -> Value * {
2632     if (State.VF.isScalable()) {
2633       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2634       assert(InterleaveFactor == 2 &&
2635              "Unsupported deinterleave factor for scalable vectors");
2636       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2637       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2638       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2639                                      State.VF.getKnownMinValue() * 2, true);
2640       return State.Builder.CreateIntrinsic(
2641           MaskTy, Intrinsic::vector_interleave2, Ops,
2642           /*FMFSource=*/nullptr, "interleaved.mask");
2643     }
2644 
2645     if (!BlockInMask)
2646       return MaskForGaps;
2647 
2648     Value *BlockInMaskPart = State.get(BlockInMask, Part);
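         // E.g., for factor 2 and VF 4, a block mask <m0, m1, m2, m3> is
         // replicated to the interleaved mask <m0, m0, m1, m1, m2, m2, m3, m3>.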
2649     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2650         BlockInMaskPart,
2651         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2652         "interleaved.mask");
2653     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2654                                                    ShuffledMask, MaskForGaps)
2655                        : ShuffledMask;
2656   };
2657 
2658   const DataLayout &DL = Instr->getDataLayout();
2659   // Vectorize the interleaved load group.
2660   if (isa<LoadInst>(Instr)) {
2661     Value *MaskForGaps = nullptr;
2662     if (NeedsMaskForGaps) {
2663       MaskForGaps = createBitMaskForGaps(State.Builder,
2664                                          State.VF.getKnownMinValue(), *Group);
2665       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2666     }
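         // E.g., for factor 3 with the member at index 1 missing and VF 4, the
         // gap mask is <1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1>, disabling the
         // lanes that would read the missing member.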
2667 
2668     // For each unroll part, create a wide load for the group.
2669     SmallVector<Value *, 2> NewLoads;
2670     for (unsigned Part = 0; Part < State.UF; Part++) {
2671       Instruction *NewLoad;
2672       if (BlockInMask || MaskForGaps) {
2673         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2674         NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
2675                                                  Group->getAlign(), GroupMask,
2676                                                  PoisonVec, "wide.masked.vec");
2677       } else
2678         NewLoad = State.Builder.CreateAlignedLoad(
2679             VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
2680       Group->addMetadata(NewLoad);
2681       NewLoads.push_back(NewLoad);
2682     }
2683 
2684     ArrayRef<VPValue *> VPDefs = definedValues();
2686     if (VecTy->isScalableTy()) {
2687       assert(InterleaveFactor == 2 &&
2688              "Unsupported deinterleave factor for scalable vectors");
2689 
2690       for (unsigned Part = 0; Part < State.UF; ++Part) {
2691         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2692         // so an intrinsic must be used to deinterleave.
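             // E.g., for a factor-2 group of i32 loaded as <vscale x 8 x i32>:
             //   %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
             //     @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)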
2693         Value *DI = State.Builder.CreateIntrinsic(
2694             Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2695             /*FMFSource=*/nullptr, "strided.vec");
2696         unsigned J = 0;
2697         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2698           Instruction *Member = Group->getMember(I);
2699 
2700           if (!Member)
2701             continue;
2702 
2703           Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2704           // If this member has a different type, cast the result to that type.
2705           if (Member->getType() != ScalarTy) {
2706             VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2707             StridedVec =
2708                 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2709           }
2710 
2711           if (Group->isReverse())
2712             StridedVec =
2713                 State.Builder.CreateVectorReverse(StridedVec, "reverse");
2714 
2715           State.set(VPDefs[J], StridedVec, Part);
2716           ++J;
2717         }
2718       }
2719 
2720       return;
2721     }
2722 
2723     // For each member in the group, shuffle out the appropriate data from the
2724     // wide loads.
2725     unsigned J = 0;
2726     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2727       Instruction *Member = Group->getMember(I);
2728 
2729       // Skip the gaps in the group.
2730       if (!Member)
2731         continue;
2732 
2733       auto StrideMask =
2734           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
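           // E.g., for member index 0, factor 2 and VF 4, the stride mask is
           // <0, 2, 4, 6>, extracting the even elements of the wide load.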
2735       for (unsigned Part = 0; Part < State.UF; Part++) {
2736         Value *StridedVec = State.Builder.CreateShuffleVector(
2737             NewLoads[Part], StrideMask, "strided.vec");
2738 
2739         // If this member has a different type, cast the result to that type.
2740         if (Member->getType() != ScalarTy) {
2741           assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2742           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2743           StridedVec =
2744               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2745         }
2746 
2747         if (Group->isReverse())
2748           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2749 
2750         State.set(VPDefs[J], StridedVec, Part);
2751       }
2752       ++J;
2753     }
2754     return;
2755   }
2756 
2757   // The subvector type for the current instruction.
2758   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2759 
2760   // Vectorize the interleaved store group.
2761   Value *MaskForGaps =
2762       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2763   assert((!MaskForGaps || !State.VF.isScalable()) &&
2764          "masking gaps for scalable vectors is not yet supported.");
2765   ArrayRef<VPValue *> StoredValues = getStoredValues();
2766   for (unsigned Part = 0; Part < State.UF; Part++) {
2767     // Collect the stored vector from each member.
2768     SmallVector<Value *, 4> StoredVecs;
2769     unsigned StoredIdx = 0;
2770     for (unsigned i = 0; i < InterleaveFactor; i++) {
2771       assert((Group->getMember(i) || MaskForGaps) &&
2772              "Failed to get a member from an interleaved store group");
2773       Instruction *Member = Group->getMember(i);
2774 
2775       // Skip the gaps in the group.
2776       if (!Member) {
2777         Value *Poison = PoisonValue::get(SubVT);
2778         StoredVecs.push_back(Poison);
2779         continue;
2780       }
2781 
2782       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2783       ++StoredIdx;
2784 
2785       if (Group->isReverse())
2786         StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2787 
2788       // If this member has a different type, cast it to the unified type.
2790       if (StoredVec->getType() != SubVT)
2791         StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2792 
2793       StoredVecs.push_back(StoredVec);
2794     }
2795 
2796     // Interleave all the smaller vectors into one wider vector.
2797     Value *IVec =
2798         interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
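         // E.g., for factor 2 and VF 4, members <a0, a1, a2, a3> and
         // <b0, b1, b2, b3> are interleaved into the wide vector
         // <a0, b0, a1, b1, a2, b2, a3, b3>.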
2799     Instruction *NewStoreInstr;
2800     if (BlockInMask || MaskForGaps) {
2801       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2802       NewStoreInstr = State.Builder.CreateMaskedStore(
2803           IVec, AddrParts[Part], Group->getAlign(), GroupMask);
2804     } else
2805       NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
2806                                                        Group->getAlign());
2807 
2808     Group->addMetadata(NewStoreInstr);
2809   }
2810 }
2811 
2812 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2813 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2814                                VPSlotTracker &SlotTracker) const {
2815   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2816   IG->getInsertPos()->printAsOperand(O, false);
2817   O << ", ";
2818   getAddr()->printAsOperand(O, SlotTracker);
2819   VPValue *Mask = getMask();
2820   if (Mask) {
2821     O << ", ";
2822     Mask->printAsOperand(O, SlotTracker);
2823   }
2824 
2825   unsigned OpIdx = 0;
2826   for (unsigned i = 0; i < IG->getFactor(); ++i) {
2827     if (!IG->getMember(i))
2828       continue;
2829     if (getNumStoreOperands() > 0) {
2830       O << "\n" << Indent << "  store ";
2831       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2832       O << " to index " << i;
2833     } else {
2834       O << "\n" << Indent << "  ";
2835       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2836       O << " = load from index " << i;
2837     }
2838     ++OpIdx;
2839   }
2840 }
2841 #endif
2842 
2843 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2844   Value *Start = getStartValue()->getLiveInIRValue();
2845   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
2846   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2847 
2848   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2849   EntryPart->addIncoming(Start, VectorPH);
2850   EntryPart->setDebugLoc(getDebugLoc());
2851   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2852     State.set(this, EntryPart, Part, /*IsScalar*/ true);
2853 }
2854 
2855 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2856 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2857                                    VPSlotTracker &SlotTracker) const {
2858   O << Indent << "EMIT ";
2859   printAsOperand(O, SlotTracker);
2860   O << " = CANONICAL-INDUCTION ";
2861   printOperands(O, SlotTracker);
2862 }
2863 #endif
2864 
2865 bool VPCanonicalIVPHIRecipe::isCanonical(
2866     InductionDescriptor::InductionKind Kind, VPValue *Start,
2867     VPValue *Step) const {
2868   // Must be an integer induction.
2869   if (Kind != InductionDescriptor::IK_IntInduction)
2870     return false;
2871   // Start must match the start value of this canonical induction.
2872   if (Start != getStartValue())
2873     return false;
2874 
2875   // If the step is defined by a recipe, it is not a ConstantInt.
2876   if (Step->getDefiningRecipe())
2877     return false;
2878 
2879   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2880   return StepC && StepC->isOne();
2881 }
2882 
2883 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2884   return IsScalarAfterVectorization &&
2885          (!IsScalable || vputils::onlyFirstLaneUsed(this));
2886 }
2887 
2888 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
2889   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
2890          "Not a pointer induction according to InductionDescriptor!");
2891   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
2892          "Unexpected type.");
2893   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
2894          "Recipe should have been replaced");
2895 
2896   auto *IVR = getParent()->getPlan()->getCanonicalIV();
2897   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
2898   Type *PhiType = IndDesc.getStep()->getType();
2899 
2900   // Build a scalar pointer phi.
2901   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
2902   Type *ScStValueType = ScalarStartValue->getType();
2903   PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
2904                                            CanonicalIV->getIterator());
2905 
2906   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2907   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
2908 
2909   // Advance the pointer induction using a GEP (step * VF * UF per iteration).
2910   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
2911 
2912   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
2913   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
2914   Value *NumUnrolledElems =
2915       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
2916   Value *InductionGEP = GetElementPtrInst::Create(
2917       State.Builder.getInt8Ty(), NewPointerPhi,
2918       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
2919       InductionLoc);
2920   // Add induction update using an incorrect block temporarily. The phi node
2921   // will be fixed after VPlan execution. Note that at this point the latch
2922   // block cannot be used, as it does not exist yet.
2923   // TODO: Model increment value in VPlan, by turning the recipe into a
2924   // multi-def and a subclass of VPHeaderPHIRecipe.
2925   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
2926 
2927   // Create UF actual address GEPs that use the pointer phi as their base and
2928   // a vectorized version of the step value (<step*0, ..., step*N>) as their
2929   // offset.
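       // E.g., for VF = 4 and a scalar step S, part 0 uses the offsets
       // <0, S, 2*S, 3*S> and part 1 uses <4*S, 5*S, 6*S, 7*S>.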
2930   for (unsigned Part = 0; Part < State.UF; ++Part) {
2931     Type *VecPhiType = VectorType::get(PhiType, State.VF);
2932     Value *StartOffsetScalar =
2933         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
2934     Value *StartOffset =
2935         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
2936     // Add a step vector of consecutive numbers <0, 1, ..., VF - 1>.
2937     StartOffset = State.Builder.CreateAdd(
2938         StartOffset, State.Builder.CreateStepVector(VecPhiType));
2939 
2940     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
2941            "scalar step must be the same across all parts");
2942     Value *GEP = State.Builder.CreateGEP(
2943         State.Builder.getInt8Ty(), NewPointerPhi,
2944         State.Builder.CreateMul(
2945             StartOffset,
2946             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue)),
2947         "vector.gep");
2948     State.set(this, GEP, Part);
2949   }
2950 }
2951 
2952 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2953 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2954                                           VPSlotTracker &SlotTracker) const {
2955   O << Indent << "EMIT ";
2956   printAsOperand(O, SlotTracker);
2957   O << " = WIDEN-POINTER-INDUCTION ";
2958   getStartValue()->printAsOperand(O, SlotTracker);
2959   O << ", " << *IndDesc.getStep();
2960 }
2961 #endif
2962 
2963 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2964   assert(!State.Instance && "cannot be used in per-lane replication");
2965   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2966   SCEVExpander Exp(SE, DL, "induction");
2967 
2968   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
2969                                  &*State.Builder.GetInsertPoint());
2970   assert(!State.ExpandedSCEVs.contains(Expr) &&
2971          "Same SCEV expanded multiple times");
2972   State.ExpandedSCEVs[Expr] = Res;
2973   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2974     State.set(this, Res, {Part, 0});
2975 }
2976 
2977 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2978 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2979                                VPSlotTracker &SlotTracker) const {
2980   O << Indent << "EMIT ";
2981   getVPSingleValue()->printAsOperand(O, SlotTracker);
2982   O << " = EXPAND SCEV " << *Expr;
2983 }
2984 #endif
2985 
2986 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2987   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
2988   Type *STy = CanonicalIV->getType();
2989   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2990   ElementCount VF = State.VF;
2991   Value *VStart = VF.isScalar()
2992                       ? CanonicalIV
2993                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
2994   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2995     Value *VStep = createStepForVF(Builder, STy, VF, Part);
2996     if (VF.isVector()) {
2997       VStep = Builder.CreateVectorSplat(VF, VStep);
2998       VStep =
2999           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3000     }
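         // E.g., for VF = 4 and part 1, VStep is <4, 5, 6, 7> and the widened
         // IV is <iv + 4, iv + 5, iv + 6, iv + 7>.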
3001     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3002     State.set(this, CanonicalVectorIV, Part);
3003   }
3004 }
3005 
3006 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3007 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3008                                      VPSlotTracker &SlotTracker) const {
3009   O << Indent << "EMIT ";
3010   printAsOperand(O, SlotTracker);
3011   O << " = WIDEN-CANONICAL-INDUCTION ";
3012   printOperands(O, SlotTracker);
3013 }
3014 #endif
3015 
3016 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3017   auto &Builder = State.Builder;
3018   // Create a vector from the initial value.
3019   auto *VectorInit = getStartValue()->getLiveInIRValue();
3020 
3021   Type *VecTy = State.VF.isScalar()
3022                     ? VectorInit->getType()
3023                     : VectorType::get(VectorInit->getType(), State.VF);
3024 
3025   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3026   if (State.VF.isVector()) {
3027     auto *IdxTy = Builder.getInt32Ty();
3028     auto *One = ConstantInt::get(IdxTy, 1);
3029     IRBuilder<>::InsertPointGuard Guard(Builder);
3030     Builder.SetInsertPoint(VectorPH->getTerminator());
3031     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3032     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3033     VectorInit = Builder.CreateInsertElement(
3034         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3035   }
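       // E.g., for VF = 4 the initial value is <poison, poison, poison, %init>,
       // placing the scalar start value in the last vector lane.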
3036 
3037   // Create a phi node for the new recurrence.
3038   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
3039   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3040   EntryPart->addIncoming(VectorInit, VectorPH);
3041   State.set(this, EntryPart, 0);
3042 }
3043 
3044 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3045 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3046                                             VPSlotTracker &SlotTracker) const {
3047   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3048   printAsOperand(O, SlotTracker);
3049   O << " = phi ";
3050   printOperands(O, SlotTracker);
3051 }
3052 #endif
3053 
3054 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3055   auto &Builder = State.Builder;
3056 
3057   // Reductions do not have to start at zero. They can start with
3058   // any loop-invariant value.
3059   VPValue *StartVPV = getStartValue();
3060   Value *StartV = StartVPV->getLiveInIRValue();
3061 
3062   // In order to support recurrences we need to be able to vectorize Phi nodes.
3063   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3064   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3065   // this value when we vectorize all of the instructions that use the PHI.
3066   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3067   Type *VecTy = ScalarPHI ? StartV->getType()
3068                           : VectorType::get(StartV->getType(), State.VF);
3069 
3070   BasicBlock *HeaderBB = State.CFG.PrevBB;
3071   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
3072          "recipe must be in the vector loop header");
3073   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
3074   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
3075     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
3076     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
3077     State.set(this, EntryPart, Part, IsInLoop);
3078   }
3079 
3080   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3081 
3082   Value *Iden = nullptr;
3083   RecurKind RK = RdxDesc.getRecurrenceKind();
3084   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3085       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3086     // MinMax and AnyOf reductions have the start value as their identity.
3087     if (ScalarPHI) {
3088       Iden = StartV;
3089     } else {
3090       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3091       Builder.SetInsertPoint(VectorPH->getTerminator());
3092       StartV = Iden =
3093           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
3094     }
3095   } else {
3096     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3097                                        RdxDesc.getFastMathFlags());
3098 
3099     if (!ScalarPHI) {
3100       Iden = Builder.CreateVectorSplat(State.VF, Iden);
3101       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3102       Builder.SetInsertPoint(VectorPH->getTerminator());
3103       Constant *Zero = Builder.getInt32(0);
3104       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
3105     }
3106   }
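       // E.g., for an integer add reduction with VF = 4 and start value %s, the
       // identity is <0, 0, 0, 0> and the adjusted start value becomes
       // <%s, 0, 0, 0>.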
3107 
3108   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
3109     Value *EntryPart = State.get(this, Part, IsInLoop);
3110     // Make sure to add the reduction start value only to the
3111     // first unroll part.
3112     Value *StartVal = (Part == 0) ? StartV : Iden;
3113     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
3114   }
3115 }
3116 
3117 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3118 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3119                                  VPSlotTracker &SlotTracker) const {
3120   O << Indent << "WIDEN-REDUCTION-PHI ";
3121 
3122   printAsOperand(O, SlotTracker);
3123   O << " = phi ";
3124   printOperands(O, SlotTracker);
3125 }
3126 #endif
3127 
3128 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3129   assert(EnableVPlanNativePath &&
3130          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3131 
3132   Value *Op0 = State.get(getOperand(0), 0);
3133   Type *VecTy = Op0->getType();
3134   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3135   State.set(this, VecPhi, 0);
3136 }
3137 
3138 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3139 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3140                              VPSlotTracker &SlotTracker) const {
3141   O << Indent << "WIDEN-PHI ";
3142 
3143   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3144   // Unless all incoming values are modeled in VPlan, print the original PHI
3145   // directly.
3146   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3147   // values as VPValues.
3148   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3149     O << VPlanIngredient(OriginalPhi);
3150     return;
3151   }
3152 
3153   printAsOperand(O, SlotTracker);
3154   O << " = phi ";
3155   printOperands(O, SlotTracker);
3156 }
3157 #endif
3158 
3159 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3160 // remove VPActiveLaneMaskPHIRecipe.
3161 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3162   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3163   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
3164     Value *StartMask = State.get(getOperand(0), Part);
3165     PHINode *EntryPart =
3166         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3167     EntryPart->addIncoming(StartMask, VectorPH);
3168     EntryPart->setDebugLoc(getDebugLoc());
3169     State.set(this, EntryPart, Part);
3170   }
3171 }
3172 
3173 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3174 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3175                                       VPSlotTracker &SlotTracker) const {
3176   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3177 
3178   printAsOperand(O, SlotTracker);
3179   O << " = phi ";
3180   printOperands(O, SlotTracker);
3181 }
3182 #endif
3183 
3184 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
3185   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3186   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
3187   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
3188   PHINode *EntryPart =
3189       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
3190   EntryPart->addIncoming(Start, VectorPH);
3191   EntryPart->setDebugLoc(getDebugLoc());
3192   State.set(this, EntryPart, 0, /*IsScalar=*/true);
3193 }
3194 
3195 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3196 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3197                                   VPSlotTracker &SlotTracker) const {
3198   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3199 
3200   printAsOperand(O, SlotTracker);
3201   O << " = phi ";
3202   printOperands(O, SlotTracker);
3203 }
3204 #endif
3205