xref: /llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (revision 0344123ffb47c88c896f5d2207cd53491a9823ed)
1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "VPlanUtils.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Twine.h"
20 #include "llvm/Analysis/IVDescriptors.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/Instruction.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/Type.h"
27 #include "llvm/IR/Value.h"
28 #include "llvm/IR/VectorBuilder.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/CommandLine.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
34 #include "llvm/Transforms/Utils/LoopUtils.h"
35 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
36 #include <cassert>
37 
38 using namespace llvm;
39 
40 using VectorParts = SmallVector<Value *, 2>;
41 
42 namespace llvm {
43 extern cl::opt<bool> EnableVPlanNativePath;
44 }
45 extern cl::opt<unsigned> ForceTargetInstructionCost;
46 
47 #define LV_NAME "loop-vectorize"
48 #define DEBUG_TYPE LV_NAME
49 
50 bool VPRecipeBase::mayWriteToMemory() const {
51   switch (getVPDefID()) {
52   case VPInstructionSC:
53     if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()))
54       return false;
55     switch (cast<VPInstruction>(this)->getOpcode()) {
56     case Instruction::Or:
57     case Instruction::ICmp:
58     case Instruction::Select:
59     case VPInstruction::Not:
60     case VPInstruction::CalculateTripCountMinusVF:
61     case VPInstruction::CanonicalIVIncrementForPart:
62     case VPInstruction::ExtractFromEnd:
63     case VPInstruction::FirstOrderRecurrenceSplice:
64     case VPInstruction::LogicalAnd:
65     case VPInstruction::PtrAdd:
66       return false;
67     default:
68       return true;
69     }
70   case VPInterleaveSC:
71     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
72   case VPWidenStoreEVLSC:
73   case VPWidenStoreSC:
74     return true;
75   case VPReplicateSC:
76     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
77         ->mayWriteToMemory();
78   case VPWidenCallSC:
79     return !cast<VPWidenCallRecipe>(this)
80                 ->getCalledScalarFunction()
81                 ->onlyReadsMemory();
82   case VPBranchOnMaskSC:
83   case VPScalarIVStepsSC:
84   case VPPredInstPHISC:
85     return false;
86   case VPBlendSC:
87   case VPReductionEVLSC:
88   case VPReductionSC:
89   case VPWidenCanonicalIVSC:
90   case VPWidenCastSC:
91   case VPWidenGEPSC:
92   case VPWidenIntOrFpInductionSC:
93   case VPWidenLoadEVLSC:
94   case VPWidenLoadSC:
95   case VPWidenPHISC:
96   case VPWidenSC:
97   case VPWidenEVLSC:
98   case VPWidenSelectSC: {
99     const Instruction *I =
100         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
101     (void)I;
102     assert((!I || !I->mayWriteToMemory()) &&
103            "underlying instruction may write to memory");
104     return false;
105   }
106   default:
107     return true;
108   }
109 }
110 
111 bool VPRecipeBase::mayReadFromMemory() const {
112   switch (getVPDefID()) {
113   case VPWidenLoadEVLSC:
114   case VPWidenLoadSC:
115     return true;
116   case VPReplicateSC:
117     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
118         ->mayReadFromMemory();
119   case VPWidenCallSC:
120     return !cast<VPWidenCallRecipe>(this)
121                 ->getCalledScalarFunction()
122                 ->onlyWritesMemory();
123   case VPBranchOnMaskSC:
124   case VPPredInstPHISC:
125   case VPScalarIVStepsSC:
126   case VPWidenStoreEVLSC:
127   case VPWidenStoreSC:
128     return false;
129   case VPBlendSC:
130   case VPReductionEVLSC:
131   case VPReductionSC:
132   case VPWidenCanonicalIVSC:
133   case VPWidenCastSC:
134   case VPWidenGEPSC:
135   case VPWidenIntOrFpInductionSC:
136   case VPWidenPHISC:
137   case VPWidenSC:
138   case VPWidenEVLSC:
139   case VPWidenSelectSC: {
140     const Instruction *I =
141         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
142     (void)I;
143     assert((!I || !I->mayReadFromMemory()) &&
144            "underlying instruction may read from memory");
145     return false;
146   }
147   default:
148     return true;
149   }
150 }
151 
152 bool VPRecipeBase::mayHaveSideEffects() const {
153   switch (getVPDefID()) {
154   case VPDerivedIVSC:
155   case VPPredInstPHISC:
156   case VPScalarCastSC:
157     return false;
158   case VPInstructionSC:
159     return mayWriteToMemory();
160   case VPWidenCallSC: {
161     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
162     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
163   }
164   case VPBlendSC:
165   case VPReductionEVLSC:
166   case VPReductionSC:
167   case VPScalarIVStepsSC:
168   case VPWidenCanonicalIVSC:
169   case VPWidenCastSC:
170   case VPWidenGEPSC:
171   case VPWidenIntOrFpInductionSC:
172   case VPWidenPHISC:
173   case VPWidenPointerInductionSC:
174   case VPWidenSC:
175   case VPWidenEVLSC:
176   case VPWidenSelectSC: {
177     const Instruction *I =
178         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
179     (void)I;
180     assert((!I || !I->mayHaveSideEffects()) &&
181            "underlying instruction has side-effects");
182     return false;
183   }
184   case VPInterleaveSC:
185     return mayWriteToMemory();
186   case VPWidenLoadEVLSC:
187   case VPWidenLoadSC:
188   case VPWidenStoreEVLSC:
189   case VPWidenStoreSC:
190     assert(
191         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
192             mayWriteToMemory() &&
193         "mayHaveSideEffects result for ingredient differs from this "
194         "implementation");
195     return mayWriteToMemory();
196   case VPReplicateSC: {
197     auto *R = cast<VPReplicateRecipe>(this);
198     return R->getUnderlyingInstr()->mayHaveSideEffects();
199   }
200   default:
201     return true;
202   }
203 }
204 
205 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
206   VPValue *ExitValue = getOperand(0);
207   VPBasicBlock *MiddleVPBB =
208       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
209   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
210   auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
211   // Values leaving the vector loop reach live-out phis in the exiting block
212   // via the middle block.
213   auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
214                        ? MiddleVPBB
215                        : ExitingVPBB;
216   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
217   Value *V = State.get(ExitValue, VPLane(0));
218   if (Phi->getBasicBlockIndex(PredBB) != -1)
219     Phi->setIncomingValueForBlock(PredBB, V);
220   else
221     Phi->addIncoming(V, PredBB);
222 }
223 
224 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
225 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
226   O << "Live-out ";
227   getPhi()->printAsOperand(O);
228   O << " = ";
229   getOperand(0)->printAsOperand(O, SlotTracker);
230   O << "\n";
231 }
232 #endif
233 
234 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
235   assert(!Parent && "Recipe already in some VPBasicBlock");
236   assert(InsertPos->getParent() &&
237          "Insertion position not in any VPBasicBlock");
238   InsertPos->getParent()->insert(this, InsertPos->getIterator());
239 }
240 
241 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
242                                 iplist<VPRecipeBase>::iterator I) {
243   assert(!Parent && "Recipe already in some VPBasicBlock");
244   assert(I == BB.end() || I->getParent() == &BB);
245   BB.insert(this, I);
246 }
247 
248 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
249   assert(!Parent && "Recipe already in some VPBasicBlock");
250   assert(InsertPos->getParent() &&
251          "Insertion position not in any VPBasicBlock");
252   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
253 }
254 
255 void VPRecipeBase::removeFromParent() {
256   assert(getParent() && "Recipe not in any VPBasicBlock");
257   getParent()->getRecipeList().remove(getIterator());
258   Parent = nullptr;
259 }
260 
261 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
262   assert(getParent() && "Recipe not in any VPBasicBlock");
263   return getParent()->getRecipeList().erase(getIterator());
264 }
265 
266 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
267   removeFromParent();
268   insertAfter(InsertPos);
269 }
270 
271 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
272                               iplist<VPRecipeBase>::iterator I) {
273   removeFromParent();
274   insertBefore(BB, I);
275 }
276 
277 /// Return the underlying instruction to be used for computing \p R's cost via
278 /// the legacy cost model. Return nullptr if there's no suitable instruction.
279 static Instruction *getInstructionForCost(const VPRecipeBase *R) {
280   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
281     return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
282   if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
283     return IG->getInsertPos();
284   // Currently the legacy cost model only computes the instruction cost using
285   // the underlying instruction. Removing the WidenMem case here would prevent
286   // force-target-instruction-cost from overriding the cost of recipes with an
287   // underlying instruction, which would be inconsistent with the legacy model.
288   // TODO: Remove WidenMem from this function when we no longer need to compare
289   // to the legacy model.
290   if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
291     return &WidenMem->getIngredient();
292   return nullptr;
293 }
294 
295 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
296   auto *UI = getInstructionForCost(this);
297   if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
298     return 0;
299 
300   InstructionCost RecipeCost = computeCost(VF, Ctx);
301   if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
302       RecipeCost.isValid())
303     RecipeCost = InstructionCost(ForceTargetInstructionCost);
304 
305   LLVM_DEBUG({
306     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
307     dump();
308   });
309   return RecipeCost;
310 }
311 
312 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
313                                           VPCostContext &Ctx) const {
314   // Compute the cost of the recipe by falling back to the legacy cost model
315   // using the underlying instruction. If there is no underlying instruction,
316   // return 0.
317   Instruction *UI = getInstructionForCost(this);
318   if (UI && isa<VPReplicateRecipe>(this)) {
319     // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
320     // transform; avoid computing their cost multiple times for now.
321     Ctx.SkipCostComputation.insert(UI);
322   }
323   return UI ? Ctx.getLegacyCost(UI, VF) : 0;
324 }
325 
326 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
327   assert(OpType == OperationType::FPMathOp &&
328          "recipe doesn't have fast math flags");
329   FastMathFlags Res;
330   Res.setAllowReassoc(FMFs.AllowReassoc);
331   Res.setNoNaNs(FMFs.NoNaNs);
332   Res.setNoInfs(FMFs.NoInfs);
333   Res.setNoSignedZeros(FMFs.NoSignedZeros);
334   Res.setAllowReciprocal(FMFs.AllowReciprocal);
335   Res.setAllowContract(FMFs.AllowContract);
336   Res.setApproxFunc(FMFs.ApproxFunc);
337   return Res;
338 }
339 
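// Note: the unroll part, when present, is encoded as a trailing live-in
// ConstantInt operand at index PartOpIdx; recipes without that operand
// correspond to part 0.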
340 template <unsigned PartOpIdx>
341 VPValue *
342 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
343   if (U.getNumOperands() == PartOpIdx + 1)
344     return U.getOperand(PartOpIdx);
345   return nullptr;
346 }
347 
348 template <unsigned PartOpIdx>
349 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
350   if (auto *UnrollPartOp = getUnrollPartOperand(U))
351     return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
352   return 0;
353 }
354 
355 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
356                              VPValue *A, VPValue *B, DebugLoc DL,
357                              const Twine &Name)
358     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
359                           Pred, DL),
360       Opcode(Opcode), Name(Name.str()) {
361   assert(Opcode == Instruction::ICmp &&
362          "only ICmp predicates supported at the moment");
363 }
364 
365 VPInstruction::VPInstruction(unsigned Opcode,
366                              std::initializer_list<VPValue *> Operands,
367                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
368     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
369       Opcode(Opcode), Name(Name.str()) {
370   // Make sure the VPInstruction is a floating-point operation.
371   assert(isFPMathOp() && "this op can't take fast-math flags");
372 }
373 
374 bool VPInstruction::doesGeneratePerAllLanes() const {
375   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
376 }
377 
378 bool VPInstruction::canGenerateScalarForFirstLane() const {
379   if (Instruction::isBinaryOp(getOpcode()))
380     return true;
381   if (isSingleScalar() || isVectorToScalar())
382     return true;
383   switch (Opcode) {
384   case Instruction::ICmp:
385   case VPInstruction::BranchOnCond:
386   case VPInstruction::BranchOnCount:
387   case VPInstruction::CalculateTripCountMinusVF:
388   case VPInstruction::CanonicalIVIncrementForPart:
389   case VPInstruction::PtrAdd:
390   case VPInstruction::ExplicitVectorLength:
391     return true;
392   default:
393     return false;
394   }
395 }
396 
397 Value *VPInstruction::generatePerLane(VPTransformState &State,
398                                       const VPLane &Lane) {
399   IRBuilderBase &Builder = State.Builder;
400 
401   assert(getOpcode() == VPInstruction::PtrAdd &&
402          "only PtrAdd opcodes are supported for now");
403   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
404                               State.get(getOperand(1), Lane), Name);
405 }
406 
407 Value *VPInstruction::generate(VPTransformState &State) {
408   IRBuilderBase &Builder = State.Builder;
409 
410   if (Instruction::isBinaryOp(getOpcode())) {
411     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
412     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
413     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
414     auto *Res =
415         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
416     if (auto *I = dyn_cast<Instruction>(Res))
417       setFlags(I);
418     return Res;
419   }
420 
421   switch (getOpcode()) {
422   case VPInstruction::Not: {
423     Value *A = State.get(getOperand(0));
424     return Builder.CreateNot(A, Name);
425   }
426   case Instruction::ICmp: {
427     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
428     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
429     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
430     return Builder.CreateCmp(getPredicate(), A, B, Name);
431   }
432   case Instruction::Select: {
433     Value *Cond = State.get(getOperand(0));
434     Value *Op1 = State.get(getOperand(1));
435     Value *Op2 = State.get(getOperand(2));
436     return Builder.CreateSelect(Cond, Op1, Op2, Name);
437   }
438   case VPInstruction::ActiveLaneMask: {
439     // Get first lane of vector induction variable.
440     Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
441     // Get the original loop tripcount.
442     Value *ScalarTC = State.get(getOperand(1), VPLane(0));
443 
444     // If this part of the active lane mask is scalar, generate the CMP directly
445     // to avoid unnecessary extracts.
446     if (State.VF.isScalar())
447       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
448                                Name);
449 
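    // Illustrative example of the IR generated here, assuming VF = 4 and an
    // i64 trip count:
    //   %active.lane.mask = call <4 x i1>
    //       @llvm.get.active.lane.mask.v4i1.i64(i64 %iv0, i64 %tc)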
450     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
451     auto *PredTy = VectorType::get(Int1Ty, State.VF);
452     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
453                                    {PredTy, ScalarTC->getType()},
454                                    {VIVElem0, ScalarTC}, nullptr, Name);
455   }
456   case VPInstruction::FirstOrderRecurrenceSplice: {
457     // Generate code to combine the previous and current values in vector v3.
458     //
459     //   vector.ph:
460     //     v_init = vector(..., ..., ..., a[-1])
461     //     br vector.body
462     //
463     //   vector.body
464     //     i = phi [0, vector.ph], [i+4, vector.body]
465     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
466     //     v2 = a[i, i+1, i+2, i+3];
467     //     v3 = vector(v1(3), v2(0, 1, 2))
468 
469     auto *V1 = State.get(getOperand(0));
470     if (!V1->getType()->isVectorTy())
471       return V1;
472     Value *V2 = State.get(getOperand(1));
473     return Builder.CreateVectorSplice(V1, V2, -1, Name);
474   }
475   case VPInstruction::CalculateTripCountMinusVF: {
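    // Compute the scalar trip count minus the full vector step (VF * UF),
    // clamped at zero, i.e. TC > VF * UF ? TC - VF * UF : 0.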
476     unsigned UF = getParent()->getPlan()->getUF();
477     Value *ScalarTC = State.get(getOperand(0), VPLane(0));
478     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
479     Value *Sub = Builder.CreateSub(ScalarTC, Step);
480     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
481     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
482     return Builder.CreateSelect(Cmp, Sub, Zero);
483   }
484   case VPInstruction::ExplicitVectorLength: {
485     // TODO: Restructure this code with an explicit remainder loop; vsetvli can
486     // be outside of the main loop.
487     Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
488     // Compute EVL
489     assert(AVL->getType()->isIntegerTy() &&
490            "Requested vector length should be an integer.");
491 
492     assert(State.VF.isScalable() && "Expected scalable vector factor.");
493     Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
494 
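    // Illustrative example of the IR generated below, assuming an i64 AVL and
    // VF = vscale x 4:
    //   %evl = call i32 @llvm.experimental.get.vector.length.i64(
    //       i64 %avl, i32 4, i1 true)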
495     Value *EVL = State.Builder.CreateIntrinsic(
496         State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
497         {AVL, VFArg, State.Builder.getTrue()});
498     return EVL;
499   }
500   case VPInstruction::CanonicalIVIncrementForPart: {
501     unsigned Part = getUnrollPart(*this);
502     auto *IV = State.get(getOperand(0), VPLane(0));
503     assert(Part != 0 && "Must have a positive part");
504     // The canonical IV is incremented by the vectorization factor (num of
505     // SIMD elements) times the unroll part.
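    // For example, part 2 with VF = 4 adds a step of 8 (8 * vscale for
    // scalable VFs).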
506     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
507     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
508                              hasNoSignedWrap());
509   }
510   case VPInstruction::BranchOnCond: {
511     Value *Cond = State.get(getOperand(0), VPLane(0));
512     // Replace the temporary unreachable terminator with a new conditional
513     // branch, hooking it up to the backward destination for exiting blocks now
514     // and to the forward destination(s) later when they are created.
515     BranchInst *CondBr =
516         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
517     CondBr->setSuccessor(0, nullptr);
518     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
519 
520     if (!getParent()->isExiting())
521       return CondBr;
522 
523     VPRegionBlock *ParentRegion = getParent()->getParent();
524     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
525     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
526     return CondBr;
527   }
528   case VPInstruction::BranchOnCount: {
529     // First create the compare.
530     Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
531     Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
532     Value *Cond = Builder.CreateICmpEQ(IV, TC);
533 
534     // Now create the branch.
535     auto *Plan = getParent()->getPlan();
536     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
537     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
538 
539     // Replace the temporary unreachable terminator with a new conditional
540     // branch, hooking it up to backward destination (the header) now and to the
541     // forward destination (the exit/middle block) later when it is created.
542     // Note that CreateCondBr expects a valid BB as first argument, so we need
543     // to set it to nullptr later.
544     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
545                                               State.CFG.VPBB2IRBB[Header]);
546     CondBr->setSuccessor(0, nullptr);
547     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
548     return CondBr;
549   }
550   case VPInstruction::ComputeReductionResult: {
551     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
552     // and will be removed by breaking up the recipe further.
553     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
554     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
555     // Get its reduction variable descriptor.
556     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
557 
558     RecurKind RK = RdxDesc.getRecurrenceKind();
559 
560     Type *PhiTy = OrigPhi->getType();
561     // The recipe's operands are the reduction phi, followed by one operand for
562     // each part of the reduction.
563     unsigned UF = getNumOperands() - 1;
564     VectorParts RdxParts(UF);
565     for (unsigned Part = 0; Part < UF; ++Part)
566       RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
567 
568     // If the vector reduction can be performed in a smaller type, we truncate
569     // then extend the loop exit value to enable InstCombine to evaluate the
570     // entire expression in the smaller type.
571     // TODO: Handle this in truncateToMinBW.
572     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
573       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
574       for (unsigned Part = 0; Part < UF; ++Part)
575         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
576     }
577     // Reduce all of the unrolled parts into a single vector.
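    // For example (illustrative), an unordered integer add reduction with
    // UF = 2 emits: %bin.rdx = add <VF x i32> %rdx.part1, %rdx.part0.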
578     Value *ReducedPartRdx = RdxParts[0];
579     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
580     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
581       Op = Instruction::Or;
582 
583     if (PhiR->isOrdered()) {
584       ReducedPartRdx = RdxParts[UF - 1];
585     } else {
586       // Floating-point operations should have some FMF to enable the reduction.
587       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
588       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
589       for (unsigned Part = 1; Part < UF; ++Part) {
590         Value *RdxPart = RdxParts[Part];
591         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
592           ReducedPartRdx = Builder.CreateBinOp(
593               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
594         else
595           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
596       }
597     }
598 
599     // Create the reduction after the loop. Note that in-loop reductions create
600     // the target reduction in the loop using a Reduction recipe.
601     if ((State.VF.isVector() ||
602          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
603         !PhiR->isInLoop()) {
604       ReducedPartRdx =
605           createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
606       // If the reduction can be performed in a smaller type, we need to extend
607       // the reduction to the wider type before we branch to the original loop.
608       if (PhiTy != RdxDesc.getRecurrenceType())
609         ReducedPartRdx = RdxDesc.isSigned()
610                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
611                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
612     }
613 
614     return ReducedPartRdx;
615   }
616   case VPInstruction::ExtractFromEnd: {
617     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
618     unsigned Offset = CI->getZExtValue();
619     assert(Offset > 0 && "Offset from end must be positive");
620     Value *Res;
621     if (State.VF.isVector()) {
622       assert(Offset <= State.VF.getKnownMinValue() &&
623              "invalid offset to extract from");
624       // Extract lane VF - Offset from the operand.
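      // For example, Offset == 1 extracts the last lane and Offset == 2 the
      // second-to-last lane.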
625       Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
626     } else {
627       assert(Offset <= 1 && "invalid offset to extract from");
628       Res = State.get(getOperand(0));
629     }
630     if (isa<ExtractElementInst>(Res))
631       Res->setName(Name);
632     return Res;
633   }
634   case VPInstruction::LogicalAnd: {
635     Value *A = State.get(getOperand(0));
636     Value *B = State.get(getOperand(1));
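    // CreateLogicalAnd emits a poison-safe select rather than an 'and', e.g.
    //   %r = select <4 x i1> %a, <4 x i1> %b, <4 x i1> zeroinitializer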
637     return Builder.CreateLogicalAnd(A, B, Name);
638   }
639   case VPInstruction::PtrAdd: {
640     assert(vputils::onlyFirstLaneUsed(this) &&
641            "can only generate first lane for PtrAdd");
642     Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
643     Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
644     return Builder.CreatePtrAdd(Ptr, Addend, Name);
645   }
646   case VPInstruction::ResumePhi: {
647     Value *IncomingFromVPlanPred =
648         State.get(getOperand(0), /* IsScalar */ true);
649     Value *IncomingFromOtherPreds =
650         State.get(getOperand(1), /* IsScalar */ true);
651     auto *NewPhi =
652         Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
653     BasicBlock *VPlanPred =
654         State.CFG
655             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
656     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
657     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
658       assert(OtherPred != VPlanPred &&
659              "VPlan predecessors should not be connected yet");
660       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
661     }
662     return NewPhi;
663   }
664 
665   default:
666     llvm_unreachable("Unsupported opcode for instruction");
667   }
668 }
669 
670 bool VPInstruction::isVectorToScalar() const {
671   return getOpcode() == VPInstruction::ExtractFromEnd ||
672          getOpcode() == VPInstruction::ComputeReductionResult;
673 }
674 
675 bool VPInstruction::isSingleScalar() const {
676   return getOpcode() == VPInstruction::ResumePhi;
677 }
678 
679 #if !defined(NDEBUG)
680 bool VPInstruction::isFPMathOp() const {
681   // Inspired by FPMathOperator::classof. Notable differences are that we don't
682   // support Call and PHI opcodes here yet.
683   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
684          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
685          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
686          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
687 }
688 #endif
689 
690 void VPInstruction::execute(VPTransformState &State) {
691   assert(!State.Lane && "VPInstruction executing a Lane");
692   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
693   assert((hasFastMathFlags() == isFPMathOp() ||
694           getOpcode() == Instruction::Select) &&
695          "Recipe not an FPMathOp but has fast-math flags?");
696   if (hasFastMathFlags())
697     State.Builder.setFastMathFlags(getFastMathFlags());
698   State.setDebugLocFrom(getDebugLoc());
699   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
700                                    (vputils::onlyFirstLaneUsed(this) ||
701                                     isVectorToScalar() || isSingleScalar());
702   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
703   if (GeneratesPerAllLanes) {
704     for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
705          Lane != NumLanes; ++Lane) {
706       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
707       assert(GeneratedValue && "generatePerLane must produce a value");
708       State.set(this, GeneratedValue, VPLane(Lane));
709     }
710     return;
711   }
712 
713   Value *GeneratedValue = generate(State);
714   if (!hasResult())
715     return;
716   assert(GeneratedValue && "generate must produce a value");
717   assert(
718       (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
719        State.VF.isScalar()) &&
720       "scalar value but not only first lane defined");
721   State.set(this, GeneratedValue,
722             /*IsScalar*/ GeneratesPerFirstLaneOnly);
723 }
724 
725 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
726   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
727   if (Instruction::isBinaryOp(getOpcode()))
728     return vputils::onlyFirstLaneUsed(this);
729 
730   switch (getOpcode()) {
731   default:
732     return false;
733   case Instruction::ICmp:
734   case VPInstruction::PtrAdd:
735     // TODO: Cover additional opcodes.
736     return vputils::onlyFirstLaneUsed(this);
737   case VPInstruction::ActiveLaneMask:
738   case VPInstruction::ExplicitVectorLength:
739   case VPInstruction::CalculateTripCountMinusVF:
740   case VPInstruction::CanonicalIVIncrementForPart:
741   case VPInstruction::BranchOnCount:
742   case VPInstruction::BranchOnCond:
743   case VPInstruction::ResumePhi:
744     return true;
745   }
746   llvm_unreachable("switch should return");
747 }
748 
749 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
750   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
751   if (Instruction::isBinaryOp(getOpcode()))
752     return vputils::onlyFirstPartUsed(this);
753 
754   switch (getOpcode()) {
755   default:
756     return false;
757   case Instruction::ICmp:
758   case Instruction::Select:
759     return vputils::onlyFirstPartUsed(this);
760   case VPInstruction::BranchOnCount:
761   case VPInstruction::BranchOnCond:
762   case VPInstruction::CanonicalIVIncrementForPart:
763     return true;
764   }
765   llvm_unreachable("switch should return");
766 }
767 
768 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
769 void VPInstruction::dump() const {
770   VPSlotTracker SlotTracker(getParent()->getPlan());
771   print(dbgs(), "", SlotTracker);
772 }
773 
774 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
775                           VPSlotTracker &SlotTracker) const {
776   O << Indent << "EMIT ";
777 
778   if (hasResult()) {
779     printAsOperand(O, SlotTracker);
780     O << " = ";
781   }
782 
783   switch (getOpcode()) {
784   case VPInstruction::Not:
785     O << "not";
786     break;
787   case VPInstruction::SLPLoad:
788     O << "combined load";
789     break;
790   case VPInstruction::SLPStore:
791     O << "combined store";
792     break;
793   case VPInstruction::ActiveLaneMask:
794     O << "active lane mask";
795     break;
796   case VPInstruction::ResumePhi:
797     O << "resume-phi";
798     break;
799   case VPInstruction::ExplicitVectorLength:
800     O << "EXPLICIT-VECTOR-LENGTH";
801     break;
802   case VPInstruction::FirstOrderRecurrenceSplice:
803     O << "first-order splice";
804     break;
805   case VPInstruction::BranchOnCond:
806     O << "branch-on-cond";
807     break;
808   case VPInstruction::CalculateTripCountMinusVF:
809     O << "TC > VF ? TC - VF : 0";
810     break;
811   case VPInstruction::CanonicalIVIncrementForPart:
812     O << "VF * Part +";
813     break;
814   case VPInstruction::BranchOnCount:
815     O << "branch-on-count";
816     break;
817   case VPInstruction::ExtractFromEnd:
818     O << "extract-from-end";
819     break;
820   case VPInstruction::ComputeReductionResult:
821     O << "compute-reduction-result";
822     break;
823   case VPInstruction::LogicalAnd:
824     O << "logical-and";
825     break;
826   case VPInstruction::PtrAdd:
827     O << "ptradd";
828     break;
829   default:
830     O << Instruction::getOpcodeName(getOpcode());
831   }
832 
833   printFlags(O);
834   printOperands(O, SlotTracker);
835 
836   if (auto DL = getDebugLoc()) {
837     O << ", !dbg ";
838     DL.print(O);
839   }
840 }
841 #endif
842 
843 void VPIRInstruction::execute(VPTransformState &State) {
844   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
845          "Only PHINodes can have extra operands");
846   if (getNumOperands() == 1) {
847     VPValue *ExitValue = getOperand(0);
848     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
849                     ? VPLane::getFirstLane()
850                     : VPLane::getLastLaneForVF(State.VF);
851     auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
852     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
853     // Set insertion point in PredBB in case an extract needs to be generated.
854     // TODO: Model extracts explicitly.
855     State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
856     Value *V = State.get(ExitValue, VPLane(Lane));
857     auto *Phi = cast<PHINode>(&I);
858     Phi->addIncoming(V, PredBB);
859   }
860 
861   // Advance the insert point after the wrapped IR instruction. This allows
862   // interleaving VPIRInstructions and other recipes.
863   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
864 }
865 
866 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
867 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
868                             VPSlotTracker &SlotTracker) const {
869   O << Indent << "IR " << I;
870 
871   if (getNumOperands() != 0) {
872     assert(getNumOperands() == 1 && "can have at most 1 operand");
873     O << " (extra operand: ";
874     printOperands(O, SlotTracker);
875     O << ")";
876   }
877 }
878 #endif
879 
880 void VPWidenCallRecipe::execute(VPTransformState &State) {
881   assert(State.VF.isVector() && "not widening");
882   Function *CalledScalarFn = getCalledScalarFunction();
883   assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
884          "DbgInfoIntrinsic should have been dropped during VPlan construction");
885   State.setDebugLocFrom(getDebugLoc());
886 
887   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
888   FunctionType *VFTy = nullptr;
889   if (Variant)
890     VFTy = Variant->getFunctionType();
891   SmallVector<Type *, 2> TysForDecl;
892   // Add return type if intrinsic is overloaded on it.
893   if (UseIntrinsic &&
894       isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
895     TysForDecl.push_back(VectorType::get(
896         CalledScalarFn->getReturnType()->getScalarType(), State.VF));
897   SmallVector<Value *, 4> Args;
898   for (const auto &I : enumerate(arg_operands())) {
899     // Some intrinsics have a scalar argument - don't replace it with a
900     // vector.
901     Value *Arg;
902     if (UseIntrinsic &&
903         isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
904       Arg = State.get(I.value(), VPLane(0));
905     // Some vectorized function variants may also take a scalar argument,
906     // e.g. linear parameters for pointers. This needs to be the scalar value
907     // from the start of the respective part when interleaving.
908     else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
909       Arg = State.get(I.value(), VPLane(0));
910     else
911       Arg = State.get(I.value());
912     if (UseIntrinsic &&
913         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
914       TysForDecl.push_back(Arg->getType());
915     Args.push_back(Arg);
916   }
917 
918   Function *VectorF;
919   if (UseIntrinsic) {
920     // Use vector version of the intrinsic.
921     Module *M = State.Builder.GetInsertBlock()->getModule();
922     VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
923     assert(VectorF && "Can't retrieve vector intrinsic.");
924   } else {
925 #ifndef NDEBUG
926     assert(Variant != nullptr && "Can't create vector function.");
927 #endif
928     VectorF = Variant;
929   }
930 
931   auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
932   SmallVector<OperandBundleDef, 1> OpBundles;
933   if (CI)
934     CI->getOperandBundlesAsDefs(OpBundles);
935 
936   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
937 
938   setFlags(V);
939 
940   if (!V->getType()->isVoidTy())
941     State.set(this, V);
942   State.addMetadata(V, CI);
943 }
944 
945 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
946                                                VPCostContext &Ctx) const {
947   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
948   if (Variant) {
949     return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
950                                     Variant->getFunctionType()->params(),
951                                     CostKind);
952   }
953 
954   // Some backends analyze intrinsic arguments to determine cost. Use the
955   // underlying value for the operand if it has one. Otherwise try to use the
956   // operand of the underlying call instruction, if there is one. Otherwise
957   // clear Arguments.
958   // TODO: Rework TTI interface to be independent of concrete IR values.
959   SmallVector<const Value *> Arguments;
960   for (const auto &[Idx, Op] : enumerate(operands())) {
961     auto *V = Op->getUnderlyingValue();
962     if (!V) {
963       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
964         Arguments.push_back(UI->getArgOperand(Idx));
965         continue;
966       }
967       Arguments.clear();
968       break;
969     }
970     Arguments.push_back(V);
971   }
972 
973   Type *RetTy =
974       ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
975   SmallVector<Type *> ParamTys;
976   for (unsigned I = 0; I != getNumOperands(); ++I)
977     ParamTys.push_back(
978         ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
979 
980   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
981   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
982   IntrinsicCostAttributes CostAttrs(
983       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
984       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
985   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
986 }
987 
988 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
989 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
990                               VPSlotTracker &SlotTracker) const {
991   O << Indent << "WIDEN-CALL ";
992 
993   Function *CalledFn = getCalledScalarFunction();
994   if (CalledFn->getReturnType()->isVoidTy())
995     O << "void ";
996   else {
997     printAsOperand(O, SlotTracker);
998     O << " = ";
999   }
1000 
1001   O << "call";
1002   printFlags(O);
1003   O << "  @" << CalledFn->getName() << "(";
1004   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
1005     Op->printAsOperand(O, SlotTracker);
1006   });
1007   O << ")";
1008 
1009   if (VectorIntrinsicID)
1010     O << " (using vector intrinsic)";
1011   else {
1012     O << " (using library function";
1013     if (Variant->hasName())
1014       O << ": " << Variant->getName();
1015     O << ")";
1016   }
1017 }
1018 #endif
1019 
1020 void VPHistogramRecipe::execute(VPTransformState &State) {
1021   State.setDebugLocFrom(getDebugLoc());
1022   IRBuilderBase &Builder = State.Builder;
1023 
1024   Value *Address = State.get(getOperand(0));
1025   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1026   VectorType *VTy = cast<VectorType>(Address->getType());
1027 
1028   // The histogram intrinsic requires a mask even if the recipe doesn't;
1029   // if the mask operand was omitted then all lanes should be executed and
1030   // we just need to synthesize an all-true mask.
1031   Value *Mask = nullptr;
1032   if (VPValue *VPMask = getMask())
1033     Mask = State.get(VPMask);
1034   else
1035     Mask =
1036         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1037 
1038   // If this is a subtract, we want to invert the increment amount. We may
1039   // add a separate intrinsic in the future, but for now we'll try this.
1040   if (Opcode == Instruction::Sub)
1041     IncAmt = Builder.CreateNeg(IncAmt);
1042   else
1043     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1044 
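  // Illustrative example of the IR generated below, assuming VF = 4, i32
  // counters, and an all-true mask:
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //       <4 x ptr> %buckets, i32 %inc, <4 x i1> splat (i1 true))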
1045   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1046                                 {VTy, IncAmt->getType()},
1047                                 {Address, IncAmt, Mask});
1048 }
1049 
1050 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1051                                                VPCostContext &Ctx) const {
1052   // FIXME: Take the gather and scatter into account as well. For now we're
1053   //        generating the same cost as the fallback path, but we'll likely
1054   //        need to create a new TTI method for determining the cost, including
1055   //        whether we can use base + vec-of-smaller-indices or just
1056   //        vec-of-pointers.
1057   assert(VF.isVector() && "Invalid VF for histogram cost");
1058   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1059   VPValue *IncAmt = getOperand(1);
1060   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1061   VectorType *VTy = VectorType::get(IncTy, VF);
1062 
1063   // Assume that a non-constant update value (or a constant != 1) requires
1064   // a multiply, and add that into the cost.
1065   InstructionCost MulCost =
1066       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
1067   if (IncAmt->isLiveIn()) {
1068     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1069 
1070     if (CI && CI->getZExtValue() == 1)
1071       MulCost = TTI::TCC_Free;
1072   }
1073 
1074   // Find the cost of the histogram operation itself.
1075   Type *PtrTy = VectorType::get(AddressTy, VF);
1076   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1077   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1078                               Type::getVoidTy(Ctx.LLVMCtx),
1079                               {PtrTy, IncTy, MaskTy});
1080 
1081   // Add the costs together with the add/sub operation.
1082   return Ctx.TTI.getIntrinsicInstrCost(
1083              ICA, TargetTransformInfo::TCK_RecipThroughput) +
1084          MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
1085 }
1086 
1087 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1088 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1089                               VPSlotTracker &SlotTracker) const {
1090   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1091   getOperand(0)->printAsOperand(O, SlotTracker);
1092 
1093   if (Opcode == Instruction::Sub)
1094     O << ", dec: ";
1095   else {
1096     assert(Opcode == Instruction::Add);
1097     O << ", inc: ";
1098   }
1099   getOperand(1)->printAsOperand(O, SlotTracker);
1100 
1101   if (VPValue *Mask = getMask()) {
1102     O << ", mask: ";
1103     Mask->printAsOperand(O, SlotTracker);
1104   }
1105 }
1106 
1107 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1108                                 VPSlotTracker &SlotTracker) const {
1109   O << Indent << "WIDEN-SELECT ";
1110   printAsOperand(O, SlotTracker);
1111   O << " = select ";
1112   getOperand(0)->printAsOperand(O, SlotTracker);
1113   O << ", ";
1114   getOperand(1)->printAsOperand(O, SlotTracker);
1115   O << ", ";
1116   getOperand(2)->printAsOperand(O, SlotTracker);
1117   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1118 }
1119 #endif
1120 
1121 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1122   State.setDebugLocFrom(getDebugLoc());
1123 
1124   // The condition can be loop invariant but still defined inside the
1125   // loop. This means that we can't just use the original 'cond' value.
1126   // We have to take the 'vectorized' value and pick the first lane.
1127   // Instcombine will make this a no-op.
1128   auto *InvarCond =
1129       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1130 
1131   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1132   Value *Op0 = State.get(getOperand(1));
1133   Value *Op1 = State.get(getOperand(2));
1134   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1135   State.set(this, Sel);
1136   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1137 }
1138 
1139 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
1140     const FastMathFlags &FMF) {
1141   AllowReassoc = FMF.allowReassoc();
1142   NoNaNs = FMF.noNaNs();
1143   NoInfs = FMF.noInfs();
1144   NoSignedZeros = FMF.noSignedZeros();
1145   AllowReciprocal = FMF.allowReciprocal();
1146   AllowContract = FMF.allowContract();
1147   ApproxFunc = FMF.approxFunc();
1148 }
1149 
1150 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1151 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1152   switch (OpType) {
1153   case OperationType::Cmp:
1154     O << " " << CmpInst::getPredicateName(getPredicate());
1155     break;
1156   case OperationType::DisjointOp:
1157     if (DisjointFlags.IsDisjoint)
1158       O << " disjoint";
1159     break;
1160   case OperationType::PossiblyExactOp:
1161     if (ExactFlags.IsExact)
1162       O << " exact";
1163     break;
1164   case OperationType::OverflowingBinOp:
1165     if (WrapFlags.HasNUW)
1166       O << " nuw";
1167     if (WrapFlags.HasNSW)
1168       O << " nsw";
1169     break;
1170   case OperationType::FPMathOp:
1171     getFastMathFlags().print(O);
1172     break;
1173   case OperationType::GEPOp:
1174     if (GEPFlags.IsInBounds)
1175       O << " inbounds";
1176     break;
1177   case OperationType::NonNegOp:
1178     if (NonNegFlags.NonNeg)
1179       O << " nneg";
1180     break;
1181   case OperationType::Other:
1182     break;
1183   }
1184   if (getNumOperands() > 0)
1185     O << " ";
1186 }
1187 #endif
1188 
1189 void VPWidenRecipe::execute(VPTransformState &State) {
1190   State.setDebugLocFrom(getDebugLoc());
1191   auto &Builder = State.Builder;
1192   switch (Opcode) {
1193   case Instruction::Call:
1194   case Instruction::Br:
1195   case Instruction::PHI:
1196   case Instruction::GetElementPtr:
1197   case Instruction::Select:
1198     llvm_unreachable("This instruction is handled by a different recipe.");
1199   case Instruction::UDiv:
1200   case Instruction::SDiv:
1201   case Instruction::SRem:
1202   case Instruction::URem:
1203   case Instruction::Add:
1204   case Instruction::FAdd:
1205   case Instruction::Sub:
1206   case Instruction::FSub:
1207   case Instruction::FNeg:
1208   case Instruction::Mul:
1209   case Instruction::FMul:
1210   case Instruction::FDiv:
1211   case Instruction::FRem:
1212   case Instruction::Shl:
1213   case Instruction::LShr:
1214   case Instruction::AShr:
1215   case Instruction::And:
1216   case Instruction::Or:
1217   case Instruction::Xor: {
1218     // Just widen unops and binops.
1219     SmallVector<Value *, 2> Ops;
1220     for (VPValue *VPOp : operands())
1221       Ops.push_back(State.get(VPOp));
1222 
1223     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1224 
1225     if (auto *VecOp = dyn_cast<Instruction>(V))
1226       setFlags(VecOp);
1227 
1228     // Use this vector value for all users of the original instruction.
1229     State.set(this, V);
1230     State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1231     break;
1232   }
1233   case Instruction::Freeze: {
1234     Value *Op = State.get(getOperand(0));
1235 
1236     Value *Freeze = Builder.CreateFreeze(Op);
1237     State.set(this, Freeze);
1238     break;
1239   }
1240   case Instruction::ICmp:
1241   case Instruction::FCmp: {
1242     // Widen compares. Generate vector compares.
1243     bool FCmp = Opcode == Instruction::FCmp;
1244     Value *A = State.get(getOperand(0));
1245     Value *B = State.get(getOperand(1));
1246     Value *C = nullptr;
1247     if (FCmp) {
1248       // Propagate fast math flags.
1249       IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1250       if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
1251         Builder.setFastMathFlags(I->getFastMathFlags());
1252       C = Builder.CreateFCmp(getPredicate(), A, B);
1253     } else {
1254       C = Builder.CreateICmp(getPredicate(), A, B);
1255     }
1256     State.set(this, C);
1257     State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1258     break;
1259   }
1260   default:
1261     // This instruction is not vectorized by simple widening.
1262     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
1263                       << Instruction::getOpcodeName(Opcode) << "\n");
1264     llvm_unreachable("Unhandled instruction!");
1265   } // end of switch.
1266 
1267 #if !defined(NDEBUG)
1268   // Verify that VPlan type inference results agree with the type of the
1269   // generated values.
1270   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1271              State.get(this)->getType() &&
1272          "inferred type and type from generated instructions do not match");
1273 #endif
1274 }
1275 
1276 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1277                                            VPCostContext &Ctx) const {
1278   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1279   switch (Opcode) {
1280   case Instruction::FNeg: {
1281     Type *VectorTy =
1282         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1283     return Ctx.TTI.getArithmeticInstrCost(
1284         Opcode, VectorTy, CostKind,
1285         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1286         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1287   }
1288 
1289   case Instruction::UDiv:
1290   case Instruction::SDiv:
1291   case Instruction::SRem:
1292   case Instruction::URem:
1293     // More complex computation; let the legacy cost model handle this for now.
1294     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1295   case Instruction::Add:
1296   case Instruction::FAdd:
1297   case Instruction::Sub:
1298   case Instruction::FSub:
1299   case Instruction::Mul:
1300   case Instruction::FMul:
1301   case Instruction::FDiv:
1302   case Instruction::FRem:
1303   case Instruction::Shl:
1304   case Instruction::LShr:
1305   case Instruction::AShr:
1306   case Instruction::And:
1307   case Instruction::Or:
1308   case Instruction::Xor: {
1309     VPValue *RHS = getOperand(1);
1310     // Certain instructions can be cheaper to vectorize if they have a constant
1311     // second vector operand. One example of this is shifts on x86.
1312     TargetTransformInfo::OperandValueInfo RHSInfo = {
1313         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1314     if (RHS->isLiveIn())
1315       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1316 
1317     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1318         getOperand(1)->isDefinedOutsideLoopRegions())
1319       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1320     Type *VectorTy =
1321         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1322     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1323 
1324     SmallVector<const Value *, 4> Operands;
1325     if (CtxI)
1326       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1327     return Ctx.TTI.getArithmeticInstrCost(
1328         Opcode, VectorTy, CostKind,
1329         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1330         RHSInfo, Operands, CtxI, &Ctx.TLI);
1331   }
1332   case Instruction::Freeze: {
1333     // Freeze has no dedicated cost entry; assume it costs the same as 'mul'.
1334     Type *VectorTy =
1335         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1336     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
1337   }
1338   case Instruction::ICmp:
1339   case Instruction::FCmp: {
1340     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1341     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1342     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1343                                       CostKind,
1344                                       {TTI::OK_AnyValue, TTI::OP_None},
1345                                       {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1346   }
1347   default:
1348     llvm_unreachable("Unsupported opcode for instruction");
1349   }
1350 }
1351 
1352 void VPWidenEVLRecipe::execute(VPTransformState &State) {
1353   unsigned Opcode = getOpcode();
1354   // TODO: Support other opcodes
1355   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1356     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
1357 
1358   State.setDebugLocFrom(getDebugLoc());
1359 
1360   assert(State.get(getOperand(0))->getType()->isVectorTy() &&
1361          "VPWidenEVLRecipe should not be used for scalars");
1362 
1363   VPValue *EVL = getEVL();
1364   Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
1365   IRBuilderBase &BuilderIR = State.Builder;
1366   VectorBuilder Builder(BuilderIR);
1367   Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
1368 
1369   SmallVector<Value *, 4> Ops;
1370   for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
1371     VPValue *VPOp = getOperand(I);
1372     Ops.push_back(State.get(VPOp));
1373   }
1374 
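  // Illustrative example of the IR generated below, assuming an integer add
  // with VF = vscale x 4:
  //   %vp.op = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(
  //       <vscale x 4 x i32> %a, <vscale x 4 x i32> %b,
  //       <vscale x 4 x i1> splat (i1 true), i32 %evl)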
1375   Builder.setMask(Mask).setEVL(EVLArg);
1376   Value *VPInst =
1377       Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
1378   // Currently VP intrinsics only accept fast-math flags.
1379   // TODO: Enable other flags when support is added.
1380   if (isa<FPMathOperator>(VPInst))
1381     setFlags(cast<Instruction>(VPInst));
1382 
1383   State.set(this, VPInst);
1384   State.addMetadata(VPInst,
1385                     dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1386 }
1387 
1388 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1389 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1390                           VPSlotTracker &SlotTracker) const {
1391   O << Indent << "WIDEN ";
1392   printAsOperand(O, SlotTracker);
1393   O << " = " << Instruction::getOpcodeName(Opcode);
1394   printFlags(O);
1395   printOperands(O, SlotTracker);
1396 }
1397 
1398 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1399                              VPSlotTracker &SlotTracker) const {
1400   O << Indent << "WIDEN ";
1401   printAsOperand(O, SlotTracker);
1402   O << " = vp." << Instruction::getOpcodeName(getOpcode());
1403   printFlags(O);
1404   printOperands(O, SlotTracker);
1405 }
1406 #endif
1407 
1408 void VPWidenCastRecipe::execute(VPTransformState &State) {
1409   State.setDebugLocFrom(getDebugLoc());
1410   auto &Builder = State.Builder;
1411   // Vectorize casts.
1412   assert(State.VF.isVector() && "Not vectorizing?");
1413   Type *DestTy = VectorType::get(getResultType(), State.VF);
1414   VPValue *Op = getOperand(0);
1415   Value *A = State.get(Op);
1416   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1417   State.set(this, Cast);
1418   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1419 }
1420 
1421 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1422 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1423                               VPSlotTracker &SlotTracker) const {
1424   O << Indent << "WIDEN-CAST ";
1425   printAsOperand(O, SlotTracker);
1426   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1427   printFlags(O);
1428   printOperands(O, SlotTracker);
1429   O << " to " << *getResultType();
1430 }
1431 #endif
1432 
1433 /// This function adds
1434 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
1435 /// element-wise to Val. The sequence starts at \p StartIdx.
1436 /// \p BinOp is only relevant for floating-point induction variables.
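/// For example (illustrative), for an integer induction with
/// Val = <v, v, v, v>, StartIdx = 0 and Step = s, the result is
/// <v, v + s, v + 2*s, v + 3*s>.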
1437 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1438                             Instruction::BinaryOps BinOp, ElementCount VF,
1439                             IRBuilderBase &Builder) {
1440   assert(VF.isVector() && "only vector VFs are supported");
1441 
1442   // Create and check the types.
1443   auto *ValVTy = cast<VectorType>(Val->getType());
1444   ElementCount VLen = ValVTy->getElementCount();
1445 
1446   Type *STy = Val->getType()->getScalarType();
1447   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1448          "Induction Step must be an integer or FP");
1449   assert(Step->getType() == STy && "Step has wrong type");
1450 
1451   SmallVector<Constant *, 8> Indices;
1452 
1453   // Create a vector of consecutive numbers from zero to VF.
1454   VectorType *InitVecValVTy = ValVTy;
1455   if (STy->isFloatingPointTy()) {
1456     Type *InitVecValSTy =
1457         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1458     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1459   }
1460   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1461 
1462   // Splat the StartIdx
1463   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1464 
1465   if (STy->isIntegerTy()) {
1466     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1467     Step = Builder.CreateVectorSplat(VLen, Step);
1468     assert(Step->getType() == Val->getType() && "Invalid step vec");
1469     // FIXME: The newly created binary instructions should contain nsw/nuw
1470     // flags, which can be found from the original scalar operations.
1471     Step = Builder.CreateMul(InitVec, Step);
1472     return Builder.CreateAdd(Val, Step, "induction");
1473   }
1474 
1475   // Floating point induction.
1476   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1477          "Binary Opcode should be specified for FP induction");
1478   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1479   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1480 
1481   Step = Builder.CreateVectorSplat(VLen, Step);
1482   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1483   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1484 }
1485 
1486 /// A helper function that returns an integer or floating-point constant with
1487 /// value C.
1488 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1489   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1490                            : ConstantFP::get(Ty, C);
1491 }
1492 
1493 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1494   assert(!State.Lane && "Int or FP induction being replicated.");
1495 
1496   Value *Start = getStartValue()->getLiveInIRValue();
1497   const InductionDescriptor &ID = getInductionDescriptor();
1498   TruncInst *Trunc = getTruncInst();
1499   IRBuilderBase &Builder = State.Builder;
1500   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1501   assert(State.VF.isVector() && "must have vector VF");
1502 
1503   // The value from the original loop to which we are mapping the new induction
1504   // variable.
1505   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1506 
1507   // Fast-math-flags propagate from the original induction instruction.
1508   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1509   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1510     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1511 
1512   // Now do the actual transformations, and start with fetching the step value.
1513   Value *Step = State.get(getStepValue(), VPLane(0));
1514 
1515   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1516          "Expected either an induction phi-node or a truncate of it!");
1517 
1518   // Construct the initial value of the vector IV in the vector loop preheader
1519   auto CurrIP = Builder.saveIP();
1520   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1521   Builder.SetInsertPoint(VectorPH->getTerminator());
1522   if (isa<TruncInst>(EntryVal)) {
1523     assert(Start->getType()->isIntegerTy() &&
1524            "Truncation requires an integer type");
1525     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1526     Step = Builder.CreateTrunc(Step, TruncType);
1527     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1528   }
1529 
1530   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1531   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1532   Value *SteppedStart = getStepVector(
1533       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
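  // E.g. (illustrative), for a start value of 0, a step of 2 and a fixed
  // VF = 4, SteppedStart is <0, 2, 4, 6>.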
1534 
1535   // We create vector phi nodes for both integer and floating-point induction
1536   // variables. Here, we determine the kind of arithmetic we will perform.
1537   Instruction::BinaryOps AddOp;
1538   Instruction::BinaryOps MulOp;
1539   if (Step->getType()->isIntegerTy()) {
1540     AddOp = Instruction::Add;
1541     MulOp = Instruction::Mul;
1542   } else {
1543     AddOp = ID.getInductionOpcode();
1544     MulOp = Instruction::FMul;
1545   }
1546 
1547   Value *SplatVF;
1548   if (VPValue *SplatVFOperand = getSplatVFValue()) {
1549     // The recipe has been unrolled. In that case, fetch the splat value for the
1550     // induction increment.
1551     SplatVF = State.get(SplatVFOperand);
1552   } else {
1553     // Multiply the vectorization factor by the step using integer or
1554     // floating-point arithmetic as appropriate.
1555     Type *StepType = Step->getType();
1556     Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
1557     if (Step->getType()->isFloatingPointTy())
1558       RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1559     else
1560       RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1561     Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1562 
1563     // Create a vector splat to use in the induction update.
1564     //
1565     // FIXME: If the step is non-constant, we create the vector splat with
1566     //        IRBuilder. IRBuilder can constant-fold the multiply, but it
1567     //        doesn't handle a constant vector splat.
1568     SplatVF = isa<Constant>(Mul)
1569                   ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1570                   : Builder.CreateVectorSplat(State.VF, Mul);
1571   }
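  // E.g. (illustrative), continuing the example above (step 2, fixed VF = 4,
  // no unrolling), SplatVF is <8, 8, 8, 8>, so the update below computes
  // vec.ind.next = vec.ind + <8, 8, 8, 8>.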
1572 
1573   Builder.restoreIP(CurrIP);
1574 
1575   // We may need to add the step a number of times, depending on the unroll
1576   // factor. The last of those goes into the PHI.
1577   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1578   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1579   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1580   State.set(this, VecInd);
1581 
1582   Instruction *LastInduction = cast<Instruction>(
1583       Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
1584   if (isa<TruncInst>(EntryVal))
1585     State.addMetadata(LastInduction, EntryVal);
1586   LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1587 
1588   VecInd->addIncoming(SteppedStart, VectorPH);
1589   // Add induction update using an incorrect block temporarily. The phi node
1590   // will be fixed after VPlan execution. Note that at this point the latch
1591   // block cannot be used, as it does not exist yet.
1592   // TODO: Model increment value in VPlan, by turning the recipe into a
1593   // multi-def and a subclass of VPHeaderPHIRecipe.
1594   VecInd->addIncoming(LastInduction, VectorPH);
1595 }
1596 
1597 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1598 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1599                                           VPSlotTracker &SlotTracker) const {
1600   O << Indent << "WIDEN-INDUCTION";
1601   if (getTruncInst()) {
1602     O << "\\l\"";
1603     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1604     O << " +\n" << Indent << "\"  ";
1605     getVPValue(0)->printAsOperand(O, SlotTracker);
1606   } else
1607     O << " " << VPlanIngredient(IV);
1608 
1609   O << ", ";
1610   getStepValue()->printAsOperand(O, SlotTracker);
1611 
1612   O << ", ";
1613   getVFValue()->printAsOperand(O, SlotTracker);
1614 }
1615 #endif
1616 
1617 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1618   // The step may be defined by a recipe in the preheader (e.g. if it requires
1619   // SCEV expansion), but for the canonical induction the step is required to be
1620   // 1, which is represented as live-in.
1621   if (getStepValue()->getDefiningRecipe())
1622     return false;
1623   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1624   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1625   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1626   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1627          getScalarType() == CanIV->getScalarType();
1628 }
1629 
1630 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1631 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1632                               VPSlotTracker &SlotTracker) const {
1633   O << Indent;
1634   printAsOperand(O, SlotTracker);
1635   O << " = DERIVED-IV ";
1636   getStartValue()->printAsOperand(O, SlotTracker);
1637   O << " + ";
1638   getOperand(1)->printAsOperand(O, SlotTracker);
1639   O << " * ";
1640   getStepValue()->printAsOperand(O, SlotTracker);
1641 }
1642 #endif
1643 
1644 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1645   // Fast-math-flags propagate from the original induction instruction.
1646   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1647   if (hasFastMathFlags())
1648     State.Builder.setFastMathFlags(getFastMathFlags());
1649 
1650   // Compute scalar induction steps. BaseIV is the scalar induction variable
1651   // on which to base the steps, Step is the size of the step.
1652 
1653   Value *BaseIV = State.get(getOperand(0), VPLane(0));
1654   Value *Step = State.get(getStepValue(), VPLane(0));
1655   IRBuilderBase &Builder = State.Builder;
1656 
1657   // Ensure step has the same type as that of scalar IV.
1658   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1659   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1660 
1661   // We build scalar steps for both integer and floating-point induction
1662   // variables. Here, we determine the kind of arithmetic we will perform.
1663   Instruction::BinaryOps AddOp;
1664   Instruction::BinaryOps MulOp;
1665   if (BaseIVTy->isIntegerTy()) {
1666     AddOp = Instruction::Add;
1667     MulOp = Instruction::Mul;
1668   } else {
1669     AddOp = InductionOpcode;
1670     MulOp = Instruction::FMul;
1671   }
1672 
1673   // Determine the number of scalars we need to generate for each unroll
1674   // iteration.
1675   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1676   // Compute the scalar steps and save the results in State.
1677   Type *IntStepTy =
1678       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1679   Type *VecIVTy = nullptr;
1680   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1681   if (!FirstLaneOnly && State.VF.isScalable()) {
1682     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1683     UnitStepVec =
1684         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1685     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1686     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1687   }
1688 
1689   unsigned StartLane = 0;
1690   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1691   if (State.Lane) {
1692     StartLane = State.Lane->getKnownLane();
1693     EndLane = StartLane + 1;
1694   }
1695   Value *StartIdx0 =
1696       createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
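  // E.g. (illustrative), for a fixed VF = 4 and unroll part 1, StartIdx0 is 4
  // and lane L below yields BaseIV + (4 + L) * Step.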
1697 
1698   if (!FirstLaneOnly && State.VF.isScalable()) {
1699     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1700     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1701     if (BaseIVTy->isFloatingPointTy())
1702       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1703     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1704     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1705     State.set(this, Add);
1706     // It's also useful to record the per-lane values for the known minimum
1707     // number of elements, so we do that below as well. This improves the code
1708     // quality when extracting the first element, for example.
1709   }
1710 
1711   if (BaseIVTy->isFloatingPointTy())
1712     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1713 
1714   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1715     Value *StartIdx = Builder.CreateBinOp(
1716         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1717     // The step returned by `createStepForVF` is a runtime-evaluated value
1718     // when VF is scalable. Otherwise, it should be folded into a Constant.
1719     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1720            "Expected StartIdx to be folded to a constant when VF is not "
1721            "scalable");
1722     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1723     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1724     State.set(this, Add, VPLane(Lane));
1725   }
1726 }
1727 
1728 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1729 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1730                                   VPSlotTracker &SlotTracker) const {
1731   O << Indent;
1732   printAsOperand(O, SlotTracker);
1733   O << " = SCALAR-STEPS ";
1734   printOperands(O, SlotTracker);
1735 }
1736 #endif
1737 
1738 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1739   assert(State.VF.isVector() && "not widening");
1740   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1741   // Construct a vector GEP by widening the operands of the scalar GEP as
1742   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1743   // results in a vector of pointers when at least one operand of the GEP
1744   // is vector-typed. Thus, to keep the representation compact, we only use
1745   // vector-typed operands for loop-varying values.
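  // E.g. (illustrative), a scalar "getelementptr i32, ptr %base, i64 %iv" with
  // a loop-varying index is widened to
  // "getelementptr i32, ptr %base, <4 x i64> %iv.vec", producing <4 x ptr>.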
1746 
1747   if (areAllOperandsInvariant()) {
1748     // If we are vectorizing, but the GEP has only loop-invariant operands,
1749     // the GEP we build (by only using vector-typed operands for
1750     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1751     // produce a vector of pointers, we need to either arbitrarily pick an
1752     // operand to broadcast, or broadcast a clone of the original GEP.
1753     // Here, we broadcast a clone of the original.
1754     //
1755     // TODO: If at some point we decide to scalarize instructions having
1756     //       loop-invariant operands, this special case will no longer be
1757     //       required. We would add the scalarization decision to
1758     //       collectLoopScalars() and teach getVectorValue() to broadcast
1759     //       the lane-zero scalar value.
1760     SmallVector<Value *> Ops;
1761     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1762       Ops.push_back(State.get(getOperand(I), VPLane(0)));
1763 
1764     auto *NewGEP =
1765         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1766                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1767     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1768     State.set(this, Splat);
1769     State.addMetadata(Splat, GEP);
1770   } else {
1771     // If the GEP has at least one loop-varying operand, we are sure to
1772     // produce a vector of pointers unless VF is scalar.
1773     // The pointer operand of the new GEP. If it's loop-invariant, we
1774     // won't broadcast it.
1775     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
1776                                          : State.get(getOperand(0));
1777 
1778     // Collect all the indices for the new GEP. If any index is
1779     // loop-invariant, we won't broadcast it.
1780     SmallVector<Value *, 4> Indices;
1781     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1782       VPValue *Operand = getOperand(I);
1783       if (isIndexLoopInvariant(I - 1))
1784         Indices.push_back(State.get(Operand, VPLane(0)));
1785       else
1786         Indices.push_back(State.get(Operand));
1787     }
1788 
1789     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1790     // but it should be a vector, otherwise.
1791     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1792                                            Indices, "", isInBounds());
1793     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1794            "NewGEP is not a pointer vector");
1795     State.set(this, NewGEP);
1796     State.addMetadata(NewGEP, GEP);
1797   }
1798 }
1799 
1800 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1801 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1802                              VPSlotTracker &SlotTracker) const {
1803   O << Indent << "WIDEN-GEP ";
1804   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1805   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1806     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1807 
1808   O << " ";
1809   printAsOperand(O, SlotTracker);
1810   O << " = getelementptr";
1811   printFlags(O);
1812   printOperands(O, SlotTracker);
1813 }
1814 #endif
1815 
1816 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1817   auto &Builder = State.Builder;
1818   State.setDebugLocFrom(getDebugLoc());
1819   unsigned CurrentPart = getUnrollPart(*this);
1820   // Use i32 for the gep index type when the value is constant,
1821   // or query DataLayout for a more suitable index type otherwise.
1822   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
1823   Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0)
1824                       ? DL.getIndexType(Builder.getPtrTy(0))
1825                       : Builder.getInt32Ty();
1826   Value *Ptr = State.get(getOperand(0), VPLane(0));
1827   bool InBounds = isInBounds();
1828 
1829   Value *ResultPtr = nullptr;
1830   if (IsReverse) {
1831     // If the address is consecutive but reversed, then the
1832     // wide memory access needs to start at the last vector element.
1833     // RunTimeVF = VScale * VF.getKnownMinValue()
1834     // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue().
1835     Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1836     // NumElt = -CurrentPart * RunTimeVF
1837     Value *NumElt = Builder.CreateMul(
1838         ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
1839     // LastLane = 1 - RunTimeVF
1840     Value *LastLane =
1841         Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1842     ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1843     ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds);
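    // E.g. (illustrative), for a fixed VF = 4 and CurrentPart = 0 this is
    // Ptr + 0 + (1 - 4), i.e. the pointer is moved back 3 elements so the
    // reversed wide access covers the elements at offsets -3..0 from Ptr.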
1844   } else {
1845     Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
1846     ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1847   }
1848 
1849   State.set(this, ResultPtr, /*IsScalar*/ true);
1850 }
1851 
1852 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1853 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1854                                   VPSlotTracker &SlotTracker) const {
1855   O << Indent;
1856   printAsOperand(O, SlotTracker);
1857   O << " = vector-pointer ";
1858   if (IsReverse)
1859     O << "(reverse) ";
1860 
1861   printOperands(O, SlotTracker);
1862 }
1863 #endif
1864 
1865 void VPBlendRecipe::execute(VPTransformState &State) {
1866   assert(isNormalized() && "Expected blend to be normalized!");
1867   State.setDebugLocFrom(getDebugLoc());
1868   // We know that all PHIs in non-header blocks are converted into
1869   // selects, so we don't have to worry about the insertion order and we
1870   // can just use the builder.
1871   // At this point we generate the predication tree. There may be
1872   // duplications since this is a simple recursive scan, but future
1873   // optimizations will clean it up.
1874 
1875   unsigned NumIncoming = getNumIncomingValues();
1876 
1877   // Generate a sequence of selects of the form:
1878   // SELECT(Mask3, In3,
1879   //        SELECT(Mask2, In2,
1880   //               SELECT(Mask1, In1,
1881   //                      In0)))
1882   // Note that Mask0 is never used: lanes for which no path reaches this phi
1883   // (and which are therefore essentially undef) take their value from In0.
1884   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1885   Value *Result = nullptr;
1886   for (unsigned In = 0; In < NumIncoming; ++In) {
1887     // We might have single edge PHIs (blocks) - use an identity
1888     // 'select' for the first PHI operand.
1889     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
1890     if (In == 0)
1891       Result = In0; // Initialize with the first incoming value.
1892     else {
1893       // Select between the current value and the previous incoming edge
1894       // based on the incoming mask.
1895       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
1896       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
1897     }
1898   }
1899   State.set(this, Result, OnlyFirstLaneUsed);
1900 }
1901 
1902 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1903 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1904                           VPSlotTracker &SlotTracker) const {
1905   O << Indent << "BLEND ";
1906   printAsOperand(O, SlotTracker);
1907   O << " =";
1908   if (getNumIncomingValues() == 1) {
1909     // Not a User of any mask: not really blending, this is a
1910     // single-predecessor phi.
1911     O << " ";
1912     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1913   } else {
1914     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1915       O << " ";
1916       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1917       if (I == 0)
1918         continue;
1919       O << "/";
1920       getMask(I)->printAsOperand(O, SlotTracker);
1921     }
1922   }
1923 }
1924 #endif
1925 
1926 void VPReductionRecipe::execute(VPTransformState &State) {
1927   assert(!State.Lane && "Reduction being replicated.");
1928   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
1929   RecurKind Kind = RdxDesc.getRecurrenceKind();
1930   // Propagate the fast-math flags carried by the underlying instruction.
1931   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1932   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1933   Value *NewVecOp = State.get(getVecOp());
1934   if (VPValue *Cond = getCondOp()) {
1935     Value *NewCond = State.get(Cond, State.VF.isScalar());
1936     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1937     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1938 
1939     Value *Start;
1940     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
1941       Start = RdxDesc.getRecurrenceStartValue();
1942     else
1943       Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
1944                                           RdxDesc.getFastMathFlags());
1945     if (State.VF.isVector())
1946       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
1947 
1948     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
1949     NewVecOp = Select;
1950   }
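  // E.g. (illustrative), for a conditional integer add reduction with mask %m,
  // the select above is "select %m, %vec, zeroinitializer", so masked-out
  // lanes contribute the identity (0) to the reduction.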
1951   Value *NewRed;
1952   Value *NextInChain;
1953   if (IsOrdered) {
1954     if (State.VF.isVector())
1955       NewRed =
1956           createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
1957     else
1958       NewRed = State.Builder.CreateBinOp(
1959           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1960           NewVecOp);
1961     PrevInChain = NewRed;
1962     NextInChain = NewRed;
1963   } else {
1964     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
1965     NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
1966     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1967       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1968                                    NewRed, PrevInChain);
1969     else
1970       NextInChain = State.Builder.CreateBinOp(
1971           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1972   }
1973   State.set(this, NextInChain, /*IsScalar*/ true);
1974 }
1975 
1976 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1977   assert(!State.Lane && "Reduction being replicated.");
1978 
1979   auto &Builder = State.Builder;
1980   // Propagate the fast-math flags carried by the underlying instruction.
1981   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1982   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1983   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1984 
1985   RecurKind Kind = RdxDesc.getRecurrenceKind();
1986   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
1987   Value *VecOp = State.get(getVecOp());
1988   Value *EVL = State.get(getEVL(), VPLane(0));
1989 
1990   VectorBuilder VBuilder(Builder);
1991   VBuilder.setEVL(EVL);
1992   Value *Mask;
1993   // TODO: move the all-true mask generation into VectorBuilder.
1994   if (VPValue *CondOp = getCondOp())
1995     Mask = State.get(CondOp);
1996   else
1997     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
1998   VBuilder.setMask(Mask);
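  // E.g. (illustrative), an unordered float add reduction lowered here becomes
  // a call such as
  //   @llvm.vp.reduce.fadd(float <identity>, <vscale x 4 x float> %v,
  //                        <vscale x 4 x i1> %mask, i32 %evl)
  // followed by the chain update below.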
1999 
2000   Value *NewRed;
2001   if (isOrdered()) {
2002     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
2003   } else {
2004     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
2005     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2006       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2007     else
2008       NewRed = Builder.CreateBinOp(
2009           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
2010   }
2011   State.set(this, NewRed, /*IsScalar*/ true);
2012 }
2013 
2014 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2015 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2016                               VPSlotTracker &SlotTracker) const {
2017   O << Indent << "REDUCE ";
2018   printAsOperand(O, SlotTracker);
2019   O << " = ";
2020   getChainOp()->printAsOperand(O, SlotTracker);
2021   O << " +";
2022   if (isa<FPMathOperator>(getUnderlyingInstr()))
2023     O << getUnderlyingInstr()->getFastMathFlags();
2024   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2025   getVecOp()->printAsOperand(O, SlotTracker);
2026   if (isConditional()) {
2027     O << ", ";
2028     getCondOp()->printAsOperand(O, SlotTracker);
2029   }
2030   O << ")";
2031   if (RdxDesc.IntermediateStore)
2032     O << " (with final reduction value stored in invariant address sunk "
2033          "outside of loop)";
2034 }
2035 
2036 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2037                                  VPSlotTracker &SlotTracker) const {
2038   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2039   O << Indent << "REDUCE ";
2040   printAsOperand(O, SlotTracker);
2041   O << " = ";
2042   getChainOp()->printAsOperand(O, SlotTracker);
2043   O << " +";
2044   if (isa<FPMathOperator>(getUnderlyingInstr()))
2045     O << getUnderlyingInstr()->getFastMathFlags();
2046   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2047   getVecOp()->printAsOperand(O, SlotTracker);
2048   O << ", ";
2049   getEVL()->printAsOperand(O, SlotTracker);
2050   if (isConditional()) {
2051     O << ", ";
2052     getCondOp()->printAsOperand(O, SlotTracker);
2053   }
2054   O << ")";
2055   if (RdxDesc.IntermediateStore)
2056     O << " (with final reduction value stored in invariant address sunk "
2057          "outside of loop)";
2058 }
2059 #endif
2060 
2061 bool VPReplicateRecipe::shouldPack() const {
2062   // Find if the recipe is used by a widened recipe via an intervening
2063   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2064   return any_of(users(), [](const VPUser *U) {
2065     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2066       return any_of(PredR->users(), [PredR](const VPUser *U) {
2067         return !U->usesScalars(PredR);
2068       });
2069     return false;
2070   });
2071 }
2072 
2073 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2074 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2075                               VPSlotTracker &SlotTracker) const {
2076   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2077 
2078   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2079     printAsOperand(O, SlotTracker);
2080     O << " = ";
2081   }
2082   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2083     O << "call";
2084     printFlags(O);
2085     O << "@" << CB->getCalledFunction()->getName() << "(";
2086     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2087                     O, [&O, &SlotTracker](VPValue *Op) {
2088                       Op->printAsOperand(O, SlotTracker);
2089                     });
2090     O << ")";
2091   } else {
2092     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2093     printFlags(O);
2094     printOperands(O, SlotTracker);
2095   }
2096 
2097   if (shouldPack())
2098     O << " (S->V)";
2099 }
2100 #endif
2101 
2102 Value *VPScalarCastRecipe::generate(VPTransformState &State) {
2103   assert(vputils::onlyFirstLaneUsed(this) &&
2104          "Codegen only implemented for first lane.");
2105   switch (Opcode) {
2106   case Instruction::SExt:
2107   case Instruction::ZExt:
2108   case Instruction::Trunc: {
2109     // Note: SExt/ZExt not used yet.
2110     Value *Op = State.get(getOperand(0), VPLane(0));
2111     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2112   }
2113   default:
2114     llvm_unreachable("opcode not implemented yet");
2115   }
2116 }
2117 
2118 void VPScalarCastRecipe::execute(VPTransformState &State) {
2119   State.set(this, generate(State), VPLane(0));
2120 }
2121 
2122 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2123 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2124                                 VPSlotTracker &SlotTracker) const {
2125   O << Indent << "SCALAR-CAST ";
2126   printAsOperand(O, SlotTracker);
2127   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2128   printOperands(O, SlotTracker);
2129   O << " to " << *ResultTy;
2130 }
2131 #endif
2132 
2133 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2134   assert(State.Lane && "Branch on Mask works only on single instance.");
2135 
2136   unsigned Lane = State.Lane->getKnownLane();
2137 
2138   Value *ConditionBit = nullptr;
2139   VPValue *BlockInMask = getMask();
2140   if (BlockInMask) {
2141     ConditionBit = State.get(BlockInMask);
2142     if (ConditionBit->getType()->isVectorTy())
2143       ConditionBit = State.Builder.CreateExtractElement(
2144           ConditionBit, State.Builder.getInt32(Lane));
2145   } else // Block in mask is all-one.
2146     ConditionBit = State.Builder.getTrue();
2147 
2148   // Replace the temporary unreachable terminator with a new conditional branch,
2149   // whose two destinations will be set later when they are created.
2150   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2151   assert(isa<UnreachableInst>(CurrentTerminator) &&
2152          "Expected to replace unreachable terminator with conditional branch.");
2153   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2154   CondBr->setSuccessor(0, nullptr);
2155   ReplaceInstWithInst(CurrentTerminator, CondBr);
2156 }
2157 
2158 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2159   assert(State.Lane && "Predicated instruction PHI works per instance.");
2160   Instruction *ScalarPredInst =
2161       cast<Instruction>(State.get(getOperand(0), *State.Lane));
2162   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2163   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2164   assert(PredicatingBB && "Predicated block has no single predecessor.");
2165   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2166          "operand must be VPReplicateRecipe");
2167 
2168   // By current pack/unpack logic we need to generate only a single phi node: if
2169   // a vector value for the predicated instruction exists at this point it means
2170   // the instruction has vector users only, and a phi for the vector value is
2171   // needed. In this case the recipe of the predicated instruction is marked to
2172   // also do that packing, thereby "hoisting" the insert-element sequence.
2173   // Otherwise, a phi node for the scalar value is needed.
2174   if (State.hasVectorValue(getOperand(0))) {
2175     Value *VectorValue = State.get(getOperand(0));
2176     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2177     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2178     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2179     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2180     if (State.hasVectorValue(this))
2181       State.reset(this, VPhi);
2182     else
2183       State.set(this, VPhi);
2184     // NOTE: Currently we need to update the value of the operand, so the next
2185     // predicated iteration inserts its generated value in the correct vector.
2186     State.reset(getOperand(0), VPhi);
2187   } else {
2188     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2189     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2190     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2191                      PredicatingBB);
2192     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2193     if (State.hasScalarValue(this, *State.Lane))
2194       State.reset(this, Phi, *State.Lane);
2195     else
2196       State.set(this, Phi, *State.Lane);
2197     // NOTE: Currently we need to update the value of the operand, so the next
2198     // predicated iteration inserts its generated value in the correct vector.
2199     State.reset(getOperand(0), Phi, *State.Lane);
2200   }
2201 }
2202 
2203 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2204 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2205                                 VPSlotTracker &SlotTracker) const {
2206   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2207   printAsOperand(O, SlotTracker);
2208   O << " = ";
2209   printOperands(O, SlotTracker);
2210 }
2211 #endif
2212 
2213 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2214                                                  VPCostContext &Ctx) const {
2215   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2216   const Align Alignment =
2217       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2218   unsigned AS =
2219       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2220   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2221 
2222   if (!Consecutive) {
2223     // TODO: Using the original IR may not be accurate.
2224     // Currently, ARM will use the underlying IR to calculate gather/scatter
2225     // instruction cost.
2226     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2227     assert(!Reverse &&
2228            "Inconsecutive memory accesses should not be reversed.");
2229     return Ctx.TTI.getAddressComputationCost(Ty) +
2230            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2231                                           IsMasked, Alignment, CostKind,
2232                                           &Ingredient);
2233   }
2234 
2235   InstructionCost Cost = 0;
2236   if (IsMasked) {
2237     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2238                                           AS, CostKind);
2239   } else {
2240     TTI::OperandValueInfo OpInfo =
2241         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2242     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2243                                     CostKind, OpInfo, &Ingredient);
2244   }
2245   if (!Reverse)
2246     return Cost;
2247 
2248   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2249                                         cast<VectorType>(Ty), {}, CostKind, 0);
2250 }
2251 
2252 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2253   auto *LI = cast<LoadInst>(&Ingredient);
2254 
2255   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2256   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2257   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2258   bool CreateGather = !isConsecutive();
2259 
2260   auto &Builder = State.Builder;
2261   State.setDebugLocFrom(getDebugLoc());
2262   Value *Mask = nullptr;
2263   if (auto *VPMask = getMask()) {
2264     // Mask reversal is only needed when an explicit mask is present; the
2265     // implicit all-one (null) mask is its own reverse.
2266     Mask = State.get(VPMask);
2267     if (isReverse())
2268       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2269   }
2270 
2271   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
2272   Value *NewLI;
2273   if (CreateGather) {
2274     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2275                                        "wide.masked.gather");
2276   } else if (Mask) {
2277     NewLI =
2278         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2279                                  PoisonValue::get(DataTy), "wide.masked.load");
2280   } else {
2281     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2282   }
2283   // Add metadata to the load, but set the result to the reverse shuffle.
2284   State.addMetadata(NewLI, LI);
2285   if (Reverse)
2286     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2287   State.set(this, NewLI);
2288 }
2289 
2290 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2291 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2292                               VPSlotTracker &SlotTracker) const {
2293   O << Indent << "WIDEN ";
2294   printAsOperand(O, SlotTracker);
2295   O << " = load ";
2296   printOperands(O, SlotTracker);
2297 }
2298 #endif
2299 
2300 /// Use an all-true mask for the reverse rather than the actual mask, as it
2301 /// avoids a dependence without affecting the result.
2302 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2303                                      Value *EVL, const Twine &Name) {
2304   VectorType *ValTy = cast<VectorType>(Operand->getType());
2305   Value *AllTrueMask =
2306       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2307   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2308                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2309 }
2310 
2311 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2312   auto *LI = cast<LoadInst>(&Ingredient);
2313 
2314   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2315   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2316   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2317   bool CreateGather = !isConsecutive();
2318 
2319   auto &Builder = State.Builder;
2320   State.setDebugLocFrom(getDebugLoc());
2321   CallInst *NewLI;
2322   Value *EVL = State.get(getEVL(), VPLane(0));
2323   Value *Addr = State.get(getAddr(), !CreateGather);
2324   Value *Mask = nullptr;
2325   if (VPValue *VPMask = getMask()) {
2326     Mask = State.get(VPMask);
2327     if (isReverse())
2328       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2329   } else {
2330     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2331   }
2332 
2333   if (CreateGather) {
2334     NewLI =
2335         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2336                                 nullptr, "wide.masked.gather");
2337   } else {
2338     VectorBuilder VBuilder(Builder);
2339     VBuilder.setEVL(EVL).setMask(Mask);
2340     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2341         Instruction::Load, DataTy, Addr, "vp.op.load"));
2342   }
2343   NewLI->addParamAttr(
2344       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2345   State.addMetadata(NewLI, LI);
2346   Instruction *Res = NewLI;
2347   if (isReverse())
2348     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2349   State.set(this, Res);
2350 }
2351 
2352 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
2353                                                   VPCostContext &Ctx) const {
2354   if (!Consecutive || IsMasked)
2355     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2356 
2357   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
2358   // here because EVL recipes use EVL to replace the tail mask, but the
2359   // legacy model always accounts for the cost of the mask.
2360   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
2361   // no longer need to compare against the legacy cost model.
2362   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2363   const Align Alignment =
2364       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2365   unsigned AS =
2366       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2367   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2368   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2369       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2370   if (!Reverse)
2371     return Cost;
2372 
2373   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2374                                        cast<VectorType>(Ty), {}, CostKind, 0);
2375 }
2376 
2377 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2378 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2379                                  VPSlotTracker &SlotTracker) const {
2380   O << Indent << "WIDEN ";
2381   printAsOperand(O, SlotTracker);
2382   O << " = vp.load ";
2383   printOperands(O, SlotTracker);
2384 }
2385 #endif
2386 
2387 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2388   auto *SI = cast<StoreInst>(&Ingredient);
2389 
2390   VPValue *StoredVPValue = getStoredValue();
2391   bool CreateScatter = !isConsecutive();
2392   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2393 
2394   auto &Builder = State.Builder;
2395   State.setDebugLocFrom(getDebugLoc());
2396 
2397   Value *Mask = nullptr;
2398   if (auto *VPMask = getMask()) {
2399     // Mask reversal is only needed when an explicit mask is present; the
2400     // implicit all-one (null) mask is its own reverse.
2401     Mask = State.get(VPMask);
2402     if (isReverse())
2403       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2404   }
2405 
2406   Value *StoredVal = State.get(StoredVPValue);
2407   if (isReverse()) {
2408     // If we store to reverse consecutive memory locations, then we need
2409     // to reverse the order of elements in the stored value.
2410     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2411     // We don't want to update the value in the map as it might be used in
2412     // another expression. So don't call resetVectorValue(StoredVal).
2413   }
2414   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2415   Instruction *NewSI = nullptr;
2416   if (CreateScatter)
2417     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2418   else if (Mask)
2419     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2420   else
2421     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2422   State.addMetadata(NewSI, SI);
2423 }
2424 
2425 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2426 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2427                                VPSlotTracker &SlotTracker) const {
2428   O << Indent << "WIDEN store ";
2429   printOperands(O, SlotTracker);
2430 }
2431 #endif
2432 
2433 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2434   auto *SI = cast<StoreInst>(&Ingredient);
2435 
2436   VPValue *StoredValue = getStoredValue();
2437   bool CreateScatter = !isConsecutive();
2438   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2439 
2440   auto &Builder = State.Builder;
2441   State.setDebugLocFrom(getDebugLoc());
2442 
2443   CallInst *NewSI = nullptr;
2444   Value *StoredVal = State.get(StoredValue);
2445   Value *EVL = State.get(getEVL(), VPLane(0));
2446   if (isReverse())
2447     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2448   Value *Mask = nullptr;
2449   if (VPValue *VPMask = getMask()) {
2450     Mask = State.get(VPMask);
2451     if (isReverse())
2452       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2453   } else {
2454     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2455   }
2456   Value *Addr = State.get(getAddr(), !CreateScatter);
2457   if (CreateScatter) {
2458     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2459                                     Intrinsic::vp_scatter,
2460                                     {StoredVal, Addr, Mask, EVL});
2461   } else {
2462     VectorBuilder VBuilder(Builder);
2463     VBuilder.setEVL(EVL).setMask(Mask);
2464     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2465         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2466         {StoredVal, Addr}));
2467   }
2468   NewSI->addParamAttr(
2469       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2470   State.addMetadata(NewSI, SI);
2471 }
2472 
2473 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
2474                                                    VPCostContext &Ctx) const {
2475   if (!Consecutive || IsMasked)
2476     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2477 
2478   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
2479   // here because EVL recipes use EVL to replace the tail mask, but the
2480   // legacy model always accounts for the cost of the mask.
2481   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
2482   // no longer need to compare against the legacy cost model.
2483   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2484   const Align Alignment =
2485       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2486   unsigned AS =
2487       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2488   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2489   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2490       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2491   if (!Reverse)
2492     return Cost;
2493 
2494   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2495                                        cast<VectorType>(Ty), {}, CostKind, 0);
2496 }
2497 
2498 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2499 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2500                                   VPSlotTracker &SlotTracker) const {
2501   O << Indent << "WIDEN vp.store ";
2502   printOperands(O, SlotTracker);
2503 }
2504 #endif
2505 
2506 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2507                                      VectorType *DstVTy, const DataLayout &DL) {
2508   // Verify that V is a vector type with same number of elements as DstVTy.
2509   auto VF = DstVTy->getElementCount();
2510   auto *SrcVecTy = cast<VectorType>(V->getType());
2511   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2512   Type *SrcElemTy = SrcVecTy->getElementType();
2513   Type *DstElemTy = DstVTy->getElementType();
2514   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2515          "Vector elements must have same size");
2516 
2517   // Do a direct cast if element types are castable.
2518   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2519     return Builder.CreateBitOrPointerCast(V, DstVTy);
2520   }
2521   // V cannot be cast directly to the desired vector type. This may happen
2522   // when V is a floating-point vector but DstVTy is a vector of pointers,
2523   // or vice-versa. Handle this with a two-step cast through an intermediate
2524   // integer vector type, i.e. Ptr <-> Int <-> Float.
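  // E.g. (illustrative, assuming 64-bit pointers), casting <4 x double> to
  // <4 x ptr> goes <4 x double> -bitcast-> <4 x i64> -inttoptr-> <4 x ptr>.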
2525   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2526          "Only one type should be a pointer type");
2527   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2528          "Only one type should be a floating point type");
2529   Type *IntTy =
2530       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2531   auto *VecIntTy = VectorType::get(IntTy, VF);
2532   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2533   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2534 }
2535 
2536 /// Return a vector containing interleaved elements from multiple
2537 /// smaller input vectors.
2538 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2539                                 const Twine &Name) {
2540   unsigned Factor = Vals.size();
2541   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2542 
2543   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2544 #ifndef NDEBUG
2545   for (Value *Val : Vals)
2546     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2547 #endif
2548 
2549   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2550   // must use intrinsics to interleave.
2551   if (VecTy->isScalableTy()) {
2552     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2553     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2554                                    Vals,
2555                                    /*FMFSource=*/nullptr, Name);
2556   }
2557 
2558   // Fixed length. Start by concatenating all vectors into a wide vector.
2559   Value *WideVec = concatenateVectors(Builder, Vals);
2560 
2561   // Interleave the elements into the wide vector.
2562   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
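  // E.g. (illustrative), for Factor = 2 and 4-element inputs the interleave
  // mask is <0, 4, 1, 5, 2, 6, 3, 7>, pairing up matching lanes of the inputs.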
2563   return Builder.CreateShuffleVector(
2564       WideVec, createInterleaveMask(NumElts, Factor), Name);
2565 }
2566 
2567 // Try to vectorize the interleave group that \p Instr belongs to.
2568 //
2569 // E.g. Translate following interleaved load group (factor = 3):
2570 //   for (i = 0; i < N; i+=3) {
2571 //     R = Pic[i];             // Member of index 0
2572 //     G = Pic[i+1];           // Member of index 1
2573 //     B = Pic[i+2];           // Member of index 2
2574 //     ... // do something to R, G, B
2575 //   }
2576 // To:
2577 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2578 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2579 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2580 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2581 //
2582 // Or translate following interleaved store group (factor = 3):
2583 //   for (i = 0; i < N; i+=3) {
2584 //     ... do something to R, G, B
2585 //     Pic[i]   = R;           // Member of index 0
2586 //     Pic[i+1] = G;           // Member of index 1
2587 //     Pic[i+2] = B;           // Member of index 2
2588 //   }
2589 // To:
2590 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2591 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2592 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2593 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2594 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2595 void VPInterleaveRecipe::execute(VPTransformState &State) {
2596   assert(!State.Lane && "Interleave group being replicated.");
2597   const InterleaveGroup<Instruction> *Group = IG;
2598   Instruction *Instr = Group->getInsertPos();
2599 
2600   // Prepare for the vector type of the interleaved load/store.
2601   Type *ScalarTy = getLoadStoreType(Instr);
2602   unsigned InterleaveFactor = Group->getFactor();
2603   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2604 
2605   // Prepare for the new pointers.
2606   unsigned Index = Group->getIndex(Instr);
2607 
2608   // TODO: extend the masked interleaved-group support to reversed access.
2609   VPValue *BlockInMask = getMask();
2610   assert((!BlockInMask || !Group->isReverse()) &&
2611          "Reversed masked interleave-group not supported.");
2612 
2613   Value *Idx;
2614   // If the group is reverse, adjust the index to refer to the last vector lane
2615   // instead of the first. We adjust the index from the first vector lane,
2616   // rather than directly getting the pointer for lane VF - 1, because the
2617   // pointer operand of the interleaved access is supposed to be uniform.
2618   if (Group->isReverse()) {
2619     Value *RuntimeVF =
2620         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2621     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2622     Idx = State.Builder.CreateMul(Idx,
2623                                   State.Builder.getInt32(Group->getFactor()));
2624     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2625     Idx = State.Builder.CreateNeg(Idx);
2626   } else
2627     Idx = State.Builder.getInt32(-Index);
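  // E.g. (illustrative), for a reversed group with fixed VF = 4, factor 3 and
  // member index 1, Idx = -((4 - 1) * 3 + 1) = -10; in the non-reversed case
  // it is simply -1.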
2628 
2629   VPValue *Addr = getAddr();
2630   Value *ResAddr = State.get(Addr, VPLane(0));
2631   if (auto *I = dyn_cast<Instruction>(ResAddr))
2632     State.setDebugLocFrom(I->getDebugLoc());
2633 
2634   // Note that the current instruction could be a member at any index. We need
2635   // to adjust the address to that of the member at index 0.
2636   //
2637   // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2638   //       b = A[i];       // Member of index 0
2639   // Current pointer is pointed to A[i+1], adjust it to A[i].
2640   //
2641   // E.g.  A[i+1] = a;     // Member of index 1
2642   //       A[i]   = b;     // Member of index 0
2643   //       A[i+2] = c;     // Member of index 2 (Current instruction)
2644   // Current pointer is pointed to A[i+2], adjust it to A[i].
2645 
2646   bool InBounds = false;
2647   if (auto *GEP = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
2648     InBounds = GEP->isInBounds();
2649   ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
2650 
2651   State.setDebugLocFrom(Instr->getDebugLoc());
2652   Value *PoisonVec = PoisonValue::get(VecTy);
2653 
2654   auto CreateGroupMask = [&BlockInMask, &State,
2655                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
2656     if (State.VF.isScalable()) {
2657       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2658       assert(InterleaveFactor == 2 &&
2659              "Unsupported deinterleave factor for scalable vectors");
2660       auto *ResBlockInMask = State.get(BlockInMask);
2661       SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2662       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2663                                      State.VF.getKnownMinValue() * 2, true);
2664       return State.Builder.CreateIntrinsic(
2665           MaskTy, Intrinsic::vector_interleave2, Ops,
2666           /*FMFSource=*/nullptr, "interleaved.mask");
2667     }
2668 
2669     if (!BlockInMask)
2670       return MaskForGaps;
2671 
2672     Value *ResBlockInMask = State.get(BlockInMask);
2673     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2674         ResBlockInMask,
2675         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2676         "interleaved.mask");
2677     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2678                                                    ShuffledMask, MaskForGaps)
2679                        : ShuffledMask;
2680   };
2681 
2682   const DataLayout &DL = Instr->getDataLayout();
2683   // Vectorize the interleaved load group.
2684   if (isa<LoadInst>(Instr)) {
2685     Value *MaskForGaps = nullptr;
2686     if (NeedsMaskForGaps) {
2687       MaskForGaps = createBitMaskForGaps(State.Builder,
2688                                          State.VF.getKnownMinValue(), *Group);
2689       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2690     }
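    // Illustrative sketch of a gap mask (example values, not from a specific
    // test): for a factor-3 group with VF = 4 whose member of index 1 is
    // missing, createBitMaskForGaps produces the constant mask
    //   <1,0,1, 1,0,1, 1,0,1, 1,0,1>
    // so the lanes of the absent member are never accessed.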
2691 
2692     Instruction *NewLoad;
2693     if (BlockInMask || MaskForGaps) {
2694       Value *GroupMask = CreateGroupMask(MaskForGaps);
2695       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
2696                                                Group->getAlign(), GroupMask,
2697                                                PoisonVec, "wide.masked.vec");
2698     } else
2699       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
2700                                                 Group->getAlign(), "wide.vec");
2701     Group->addMetadata(NewLoad);
2702 
2703     ArrayRef<VPValue *> VPDefs = definedValues();
2705     if (VecTy->isScalableTy()) {
2706       assert(InterleaveFactor == 2 &&
2707              "Unsupported deinterleave factor for scalable vectors");
2708 
2709       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2710       // so must use intrinsics to deinterleave.
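      // Illustrative sketch of the emitted IR (types chosen for exposition,
      // not from a specific test), for a factor-2 group of i32 at a scalable
      // VF of 4:
      //   %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
      //       @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
      // The two struct fields are then extracted below, one per member.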
2711       Value *DI = State.Builder.CreateIntrinsic(
2712           Intrinsic::vector_deinterleave2, VecTy, NewLoad,
2713           /*FMFSource=*/nullptr, "strided.vec");
2714       unsigned J = 0;
2715       for (unsigned I = 0; I < InterleaveFactor; ++I) {
2716         Instruction *Member = Group->getMember(I);
2717 
2718         if (!Member)
2719           continue;
2720 
2721         Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2722         // If this member has a different type, cast the result to that type.
2723         if (Member->getType() != ScalarTy) {
2724           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2725           StridedVec =
2726               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2727         }
2728 
2729         if (Group->isReverse())
2730           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2731 
2732         State.set(VPDefs[J], StridedVec);
2733         ++J;
2734       }
2735 
2736       return;
2737     }
2738 
2739     // For each member in the group, shuffle out the appropriate data from the
2740     // wide loads.
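    // Illustrative sketch (example values, not from a specific test): for a
    // factor-2 group at a fixed VF of 4, the member of index 1 is extracted
    // with the stride mask <1, 3, 5, 7>:
    //   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> poison,
    //                                <4 x i32> <i32 1, i32 3, i32 5, i32 7>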
2741     unsigned J = 0;
2742     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2743       Instruction *Member = Group->getMember(I);
2744 
2745       // Skip the gaps in the group.
2746       if (!Member)
2747         continue;
2748 
2749       auto StrideMask =
2750           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2751       Value *StridedVec =
2752           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
2753 
2754       // If this member has different type, cast the result type.
2755       if (Member->getType() != ScalarTy) {
2756         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2757         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2758         StridedVec =
2759             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2760       }
2761 
2762       if (Group->isReverse())
2763         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2764 
2765       State.set(VPDefs[J], StridedVec);
2766       ++J;
2767     }
2768     return;
2769   }
2770 
2771   // The subvector type for the current instruction.
2772   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2773 
2774   // Vectorize the interleaved store group.
2775   Value *MaskForGaps =
2776       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2777   assert((!MaskForGaps || !State.VF.isScalable()) &&
2778          "masking gaps for scalable vectors is not yet supported.");
2779   ArrayRef<VPValue *> StoredValues = getStoredValues();
2780   // Collect the stored vector from each member.
2781   SmallVector<Value *, 4> StoredVecs;
2782   unsigned StoredIdx = 0;
2783   for (unsigned i = 0; i < InterleaveFactor; i++) {
2784     assert((Group->getMember(i) || MaskForGaps) &&
2785            "Fail to get a member from an interleaved store group");
2786     Instruction *Member = Group->getMember(i);
2787 
2788     // Skip the gaps in the group.
2789     if (!Member) {
2790       Value *Poison = PoisonValue::get(SubVT);
2791       StoredVecs.push_back(Poison);
2792       continue;
2793     }
2794 
2795     Value *StoredVec = State.get(StoredValues[StoredIdx]);
2796     ++StoredIdx;
2797 
2798     if (Group->isReverse())
2799       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2800 
2801     // If this member has different type, cast it to a unified type.
2802 
2803     if (StoredVec->getType() != SubVT)
2804       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2805 
2806     StoredVecs.push_back(StoredVec);
2807   }
2808 
2809   // Interleave all the smaller vectors into one wider vector.
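  // Illustrative sketch (example values, not from a specific test): for two
  // stored members at a fixed VF of 4, interleaveVectors emits
  //   %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1,
  //                                    <8 x i32> <i32 0, i32 4, i32 1, i32 5,
  //                                               i32 2, i32 6, i32 3, i32 7>
  // while scalable vectors (factor 2 only) use llvm.vector.interleave2.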
2810   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
2811   Instruction *NewStoreInstr;
2812   if (BlockInMask || MaskForGaps) {
2813     Value *GroupMask = CreateGroupMask(MaskForGaps);
2814     NewStoreInstr = State.Builder.CreateMaskedStore(
2815         IVec, ResAddr, Group->getAlign(), GroupMask);
2816   } else
2817     NewStoreInstr =
2818         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
2819 
2820   Group->addMetadata(NewStoreInstr);
2821 }
2822 
2823 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2824 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2825                                VPSlotTracker &SlotTracker) const {
2826   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2827   IG->getInsertPos()->printAsOperand(O, false);
2828   O << ", ";
2829   getAddr()->printAsOperand(O, SlotTracker);
2830   VPValue *Mask = getMask();
2831   if (Mask) {
2832     O << ", ";
2833     Mask->printAsOperand(O, SlotTracker);
2834   }
2835 
2836   unsigned OpIdx = 0;
2837   for (unsigned i = 0; i < IG->getFactor(); ++i) {
2838     if (!IG->getMember(i))
2839       continue;
2840     if (getNumStoreOperands() > 0) {
2841       O << "\n" << Indent << "  store ";
2842       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2843       O << " to index " << i;
2844     } else {
2845       O << "\n" << Indent << "  ";
2846       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2847       O << " = load from index " << i;
2848     }
2849     ++OpIdx;
2850   }
2851 }
2852 #endif
2853 
2854 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2855   Value *Start = getStartValue()->getLiveInIRValue();
2856   PHINode *Phi = PHINode::Create(Start->getType(), 2, "index");
2857   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2858 
2859   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2860   Phi->addIncoming(Start, VectorPH);
2861   Phi->setDebugLoc(getDebugLoc());
2862   State.set(this, Phi, /*IsScalar*/ true);
2863 }
2864 
2865 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2866 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2867                                    VPSlotTracker &SlotTracker) const {
2868   O << Indent << "EMIT ";
2869   printAsOperand(O, SlotTracker);
2870   O << " = CANONICAL-INDUCTION ";
2871   printOperands(O, SlotTracker);
2872 }
2873 #endif
2874 
2875 bool VPCanonicalIVPHIRecipe::isCanonical(
2876     InductionDescriptor::InductionKind Kind, VPValue *Start,
2877     VPValue *Step) const {
2878   // Must be an integer induction.
2879   if (Kind != InductionDescriptor::IK_IntInduction)
2880     return false;
2881   // Start must match the start value of this canonical induction.
2882   if (Start != getStartValue())
2883     return false;
2884 
2885   // If the step is defined by a recipe, it is not a ConstantInt.
2886   if (Step->getDefiningRecipe())
2887     return false;
2888 
2889   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2890   return StepC && StepC->isOne();
2891 }
2892 
2893 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2894   return IsScalarAfterVectorization &&
2895          (!IsScalable || vputils::onlyFirstLaneUsed(this));
2896 }
2897 
2898 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
2899   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
2900          "Not a pointer induction according to InductionDescriptor!");
2901   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
2902          "Unexpected type.");
2903   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
2904          "Recipe should have been replaced");
2905 
2906   auto *IVR = getParent()->getPlan()->getCanonicalIV();
2907   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
2908   unsigned CurrentPart = getUnrollPart(*this);
2909 
2910   // Build a pointer phi
2911   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
2912   Type *ScStValueType = ScalarStartValue->getType();
2913 
2914   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2915   PHINode *NewPointerPhi = nullptr;
2916   if (CurrentPart == 0) {
2917     NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
2918                                     CanonicalIV->getIterator());
2919     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
2920   } else {
2921     // The recipe has been unrolled. In that case, fetch the single pointer phi
2922     // shared among all unrolled parts of the recipe.
2923     auto *GEP =
2924         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
2925     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
2926   }
2927 
2928   // A pointer induction, performed by using a GEP.
2929   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
2930   Value *ScalarStepValue = State.get(getOperand(1), VPLane(0));
2931   Type *PhiType = IndDesc.getStep()->getType();
2932   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
2933   // Add induction update using an incorrect block temporarily. The phi node
2934   // will be fixed after VPlan execution. Note that at this point the latch
2935   // block cannot be used, as it does not exist yet.
2936   // TODO: Model increment value in VPlan, by turning the recipe into a
2937   // multi-def and a subclass of VPHeaderPHIRecipe.
2938   if (CurrentPart == 0) {
2939     // The recipe represents the first part of the pointer induction. Create the
2940     // GEP to increment the phi across all unrolled parts.
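    // Illustrative sketch (example values, not from a specific test): with
    // VF = 4, UF = 2 and a pointer step of %s bytes, %ptr.ind below advances
    // %pointer.phi by 4 * 2 * %s bytes per iteration of the vector loop.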
2941     unsigned UF = getParent()->getPlan()->getUF();
2942     Value *NumUnrolledElems =
2943         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
2944 
2945     Value *InductionGEP = GetElementPtrInst::Create(
2946         State.Builder.getInt8Ty(), NewPointerPhi,
2947         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
2948         InductionLoc);
2949 
2950     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
2951   }
2952 
2953   // Create actual address geps that use the pointer phi as base and a
2954   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
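  // Illustrative sketch (example values, not from a specific test): with
  // VF = 4, unroll part 1 and step %s, the per-lane byte offsets computed
  // below are (1 * 4 + <0,1,2,3>) * %s = <4*%s, 5*%s, 6*%s, 7*%s> relative to
  // the pointer phi.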
2955   Type *VecPhiType = VectorType::get(PhiType, State.VF);
2956   Value *StartOffsetScalar = State.Builder.CreateMul(
2957       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
2958   Value *StartOffset =
2959       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
2960   // Add a step vector of consecutive numbers from 0 to VF - 1.
2961   StartOffset = State.Builder.CreateAdd(
2962       StartOffset, State.Builder.CreateStepVector(VecPhiType));
2963 
2964   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
2965          "scalar step must be the same across all parts");
2966   Value *GEP = State.Builder.CreateGEP(
2967       State.Builder.getInt8Ty(), NewPointerPhi,
2968       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
2969                                                State.VF, ScalarStepValue)),
2970       "vector.gep");
2971   State.set(this, GEP);
2972 }
2973 
2974 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2975 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2976                                           VPSlotTracker &SlotTracker) const {
2977   assert((getNumOperands() == 2 || getNumOperands() == 4) &&
2978          "unexpected number of operands");
2979   O << Indent << "EMIT ";
2980   printAsOperand(O, SlotTracker);
2981   O << " = WIDEN-POINTER-INDUCTION ";
2982   getStartValue()->printAsOperand(O, SlotTracker);
2983   O << ", " << *IndDesc.getStep();
2984   if (getNumOperands() == 4) {
2985     O << ", ";
2986     getOperand(2)->printAsOperand(O, SlotTracker);
2987     O << ", ";
2988     getOperand(3)->printAsOperand(O, SlotTracker);
2989   }
2990 }
2991 #endif
2992 
2993 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2994   assert(!State.Lane && "cannot be used in per-lane");
2995   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2996   SCEVExpander Exp(SE, DL, "induction");
2997 
2998   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
2999                                  &*State.Builder.GetInsertPoint());
3000   assert(!State.ExpandedSCEVs.contains(Expr) &&
3001          "Same SCEV expanded multiple times");
3002   State.ExpandedSCEVs[Expr] = Res;
3003   State.set(this, Res, VPLane(0));
3004 }
3005 
3006 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3007 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3008                                VPSlotTracker &SlotTracker) const {
3009   O << Indent << "EMIT ";
3010   getVPSingleValue()->printAsOperand(O, SlotTracker);
3011   O << " = EXPAND SCEV " << *Expr;
3012 }
3013 #endif
3014 
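// Create a vector of canonical IV values for the current unroll part, i.e.
// broadcast the scalar canonical IV and add <Part * VF + 0, ...,
// Part * VF + (VF - 1)>. Illustrative sketch (example values, not from a
// specific test): with VF = 4 and part 1, a canonical IV value %n yields
// <%n+4, %n+5, %n+6, %n+7>.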
3015 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3016   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3017   Type *STy = CanonicalIV->getType();
3018   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3019   ElementCount VF = State.VF;
3020   Value *VStart = VF.isScalar()
3021                       ? CanonicalIV
3022                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3023   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3024   if (VF.isVector()) {
3025     VStep = Builder.CreateVectorSplat(VF, VStep);
3026     VStep =
3027         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3028   }
3029   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3030   State.set(this, CanonicalVectorIV);
3031 }
3032 
3033 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3034 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3035                                      VPSlotTracker &SlotTracker) const {
3036   O << Indent << "EMIT ";
3037   printAsOperand(O, SlotTracker);
3038   O << " = WIDEN-CANONICAL-INDUCTION ";
3039   printOperands(O, SlotTracker);
3040 }
3041 #endif
3042 
3043 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3044   auto &Builder = State.Builder;
3045   // Create a vector from the initial value.
3046   auto *VectorInit = getStartValue()->getLiveInIRValue();
3047 
3048   Type *VecTy = State.VF.isScalar()
3049                     ? VectorInit->getType()
3050                     : VectorType::get(VectorInit->getType(), State.VF);
3051 
3052   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
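  // Only the last lane of the initial recurrence vector is ever read (via the
  // first-order recurrence splice), so the remaining lanes can be left as
  // poison. Illustrative sketch (example values, not from a specific test):
  // with VF = 4 and an i32 start value %init, the preheader gets
  //   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3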
3053   if (State.VF.isVector()) {
3054     auto *IdxTy = Builder.getInt32Ty();
3055     auto *One = ConstantInt::get(IdxTy, 1);
3056     IRBuilder<>::InsertPointGuard Guard(Builder);
3057     Builder.SetInsertPoint(VectorPH->getTerminator());
3058     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3059     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3060     VectorInit = Builder.CreateInsertElement(
3061         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3062   }
3063 
3064   // Create a phi node for the new recurrence.
3065   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3066   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3067   Phi->addIncoming(VectorInit, VectorPH);
3068   State.set(this, Phi);
3069 }
3070 
3071 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3072 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3073                                             VPSlotTracker &SlotTracker) const {
3074   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3075   printAsOperand(O, SlotTracker);
3076   O << " = phi ";
3077   printOperands(O, SlotTracker);
3078 }
3079 #endif
3080 
3081 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3082   auto &Builder = State.Builder;
3083 
3084   // Reductions do not have to start at zero. They can start with
3085   // any loop-invariant value.
3086   VPValue *StartVPV = getStartValue();
3087   Value *StartV = StartVPV->getLiveInIRValue();
3088 
3089   // In order to support recurrences we need to be able to vectorize Phi nodes.
3090   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3091   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3092   // this value when we vectorize all of the instructions that use the PHI.
3093   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3094   Type *VecTy = ScalarPHI ? StartV->getType()
3095                           : VectorType::get(StartV->getType(), State.VF);
3096 
3097   BasicBlock *HeaderBB = State.CFG.PrevBB;
3098   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
3099          "recipe must be in the vector loop header");
3100   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3101   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3102   State.set(this, Phi, IsInLoop);
3103 
3104   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3105 
3106   Value *Iden = nullptr;
3107   RecurKind RK = RdxDesc.getRecurrenceKind();
3108   unsigned CurrentPart = getUnrollPart(*this);
3109 
3110   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3111       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3112     // MinMax and AnyOf reductions have the start value as their identity.
3113     if (ScalarPHI) {
3114       Iden = StartV;
3115     } else {
3116       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3117       Builder.SetInsertPoint(VectorPH->getTerminator());
3118       StartV = Iden = State.get(StartVPV);
3119     }
3120   } else {
3121     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3122                                        RdxDesc.getFastMathFlags());
3123 
3124     if (!ScalarPHI) {
3125       if (CurrentPart == 0) {
3126         // Create start and identity vector values for the reduction in the
3127         // preheader.
3128         // TODO: Introduce recipes in VPlan preheader to create initial values.
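        // Illustrative sketch (example values, not from a specific test): for
        // an integer add reduction with start value 10 and VF = 4, part 0 of
        // the phi starts at <10, 0, 0, 0> while the remaining parts start at
        // the identity splat <0, 0, 0, 0>.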
3129         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3130         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3131         Builder.SetInsertPoint(VectorPH->getTerminator());
3132         Constant *Zero = Builder.getInt32(0);
3133         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
3134       } else {
3135         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3136       }
3137     }
3138   }
3139 
3140   Phi = cast<PHINode>(State.get(this, IsInLoop));
3141   Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3142   Phi->addIncoming(StartVal, VectorPH);
3143 }
3144 
3145 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3146 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3147                                  VPSlotTracker &SlotTracker) const {
3148   O << Indent << "WIDEN-REDUCTION-PHI ";
3149 
3150   printAsOperand(O, SlotTracker);
3151   O << " = phi ";
3152   printOperands(O, SlotTracker);
3153 }
3154 #endif
3155 
3156 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3157   assert(EnableVPlanNativePath &&
3158          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3159 
3160   Value *Op0 = State.get(getOperand(0));
3161   Type *VecTy = Op0->getType();
3162   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3163   State.set(this, VecPhi);
3164 }
3165 
3166 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3167 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3168                              VPSlotTracker &SlotTracker) const {
3169   O << Indent << "WIDEN-PHI ";
3170 
3171   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3172   // Unless all incoming values are modeled in VPlan, print the original PHI
3173   // directly.
3174   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3175   // values as VPValues.
3176   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3177     O << VPlanIngredient(OriginalPhi);
3178     return;
3179   }
3180 
3181   printAsOperand(O, SlotTracker);
3182   O << " = phi ";
3183   printOperands(O, SlotTracker);
3184 }
3185 #endif
3186 
3187 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3188 // remove VPActiveLaneMaskPHIRecipe.
3189 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3190   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3191   Value *StartMask = State.get(getOperand(0));
3192   PHINode *Phi =
3193       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3194   Phi->addIncoming(StartMask, VectorPH);
3195   Phi->setDebugLoc(getDebugLoc());
3196   State.set(this, Phi);
3197 }
3198 
3199 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3200 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3201                                       VPSlotTracker &SlotTracker) const {
3202   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3203 
3204   printAsOperand(O, SlotTracker);
3205   O << " = phi ";
3206   printOperands(O, SlotTracker);
3207 }
3208 #endif
3209 
3210 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
3211   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3212   Value *Start = State.get(getOperand(0), VPLane(0));
3213   PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
3214   Phi->addIncoming(Start, VectorPH);
3215   Phi->setDebugLoc(getDebugLoc());
3216   State.set(this, Phi, /*IsScalar=*/true);
3217 }
3218 
3219 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3220 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3221                                   VPSlotTracker &SlotTracker) const {
3222   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3223 
3224   printAsOperand(O, SlotTracker);
3225   O << " = phi ";
3226   printOperands(O, SlotTracker);
3227 }
3228 #endif
3229