//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

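// Note: the may* queries below (mayWriteToMemory, mayReadFromMemory,
// mayHaveSideEffects) are conservative: any recipe not explicitly handled
// falls through to the default case and reports true.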
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
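  // By default, use the value from the last lane of the final unrolled part;
  // if the exit value is uniform after vectorization, the first lane
  // suffices. The incoming value is added to the live-out phi in the middle
  // block.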
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;

  switch (Opcode) {
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0, OnlyFirstLaneUsed);

    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop trip count.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
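    // For vector VFs this emits a call to the llvm.get.active.lane.mask
    // intrinsic, e.g. for VF = 4 roughly:
    //   %m = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %tc)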
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

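    // Computes max(TC - VF * UF, 0) as select(TC > VF * UF, TC - VF * UF, 0);
    // the step created below is VF * UF.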
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

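      // Emits a call to the llvm.experimental.get.vector.length intrinsic,
      // e.g. roughly:
      //   %evl = call i32 @llvm.experimental.get.vector.length.i64(
      //              i64 %avl, i32 <vf>, i1 true /*scalable*/)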
      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    assert(!State.EVL && "multiple EVL recipes");
    State.EVL = this;
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to the backward destination for exiting blocks
    // now and to the forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to the backward destination (the header) now and
    // to the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

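    // For a non-ordered reduction, the unrolled parts are first combined
    // pairwise; e.g. for an integer add reduction with UF = 2 this emits
    // roughly:
    //   %bin.rdx = add <VF x i32> %rdx.part1, %rdx.part0
    // before the final cross-lane reduction below.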
    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call and PHI opcodes here yet; Select is included since it may
  // carry fast-math flags.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly =
      canGenerateScalarForFirstLane() &&
      (vputils::onlyFirstLaneUsed(this) ||
       getOpcode() == VPInstruction::ComputeReductionResult);
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
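/// For example, with VF = 4, StartIdx = 0 and a splatted input <v, v, v, v>,
/// an integer induction produces <v, v + Step, v + 2 * Step, v + 3 * Step>.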
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
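  // For example, with UF = 2 the generated sequence is roughly:
  //   %vec.ind      = phi [ %stepped.start, %vector.ph ], [ %vec.ind.next, ... ]
  //   %step.add     = add %vec.ind, %splat.vf   ; value used for part 1
  //   %vec.ind.next = add %step.add, %splat.vf  ; feeds back into the phi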
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps: BaseIV is the scalar induction variable
  // on which to base the steps, and Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

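    // Each scalar step below is BaseIV + (StartIdx0 + Lane) * Step, where
    // StartIdx0 = Part * VF is the index of the first lane of this part.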
1300     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1301       Value *StartIdx = Builder.CreateBinOp(
1302           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1303       // The step returned by `createStepForVF` is a runtime-evaluated value
1304       // when VF is scalable. Otherwise, it should be folded into a Constant.
1305       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1306              "Expected StartIdx to be folded to a constant when VF is not "
1307              "scalable");
1308       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1309       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1310       State.set(this, Add, VPIteration(Part, Lane));
1311     }
1312   }
1313 }
1314 
1315 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1316 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1317                                   VPSlotTracker &SlotTracker) const {
1318   O << Indent;
1319   printAsOperand(O, SlotTracker);
1320   O << " = SCALAR-STEPS ";
1321   printOperands(O, SlotTracker);
1322 }
1323 #endif
1324 
1325 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1326   assert(State.VF.isVector() && "not widening");
1327   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1328   // Construct a vector GEP by widening the operands of the scalar GEP as
1329   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1330   // results in a vector of pointers when at least one operand of the GEP
1331   // is vector-typed. Thus, to keep the representation compact, we only use
1332   // vector-typed operands for loop-varying values.
1333 
1334   if (areAllOperandsInvariant()) {
1335     // If we are vectorizing, but the GEP has only loop-invariant operands,
1336     // the GEP we build (by only using vector-typed operands for
1337     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1338     // produce a vector of pointers, we need to either arbitrarily pick an
1339     // operand to broadcast, or broadcast a clone of the original GEP.
1340     // Here, we broadcast a clone of the original.
1341     //
1342     // TODO: If at some point we decide to scalarize instructions having
1343     //       loop-invariant operands, this special case will no longer be
1344     //       required. We would add the scalarization decision to
1345     //       collectLoopScalars() and teach getVectorValue() to broadcast
1346     //       the lane-zero scalar value.
1347     SmallVector<Value *> Ops;
1348     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1349       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1350 
1351     auto *NewGEP =
1352         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1353                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1354     for (unsigned Part = 0; Part < State.UF; ++Part) {
1355       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1356       State.set(this, EntryPart, Part);
1357       State.addMetadata(EntryPart, GEP);
1358     }
1359   } else {
1360     // If the GEP has at least one loop-varying operand, we are sure to
1361     // produce a vector of pointers. But if we are only unrolling, we want
1362     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1363     // produce with the code below will be scalar (if VF == 1) or vector
1364     // (otherwise). Note that for the unroll-only case, we still maintain
1365     // values in the vector mapping with initVector, as we do for other
1366     // instructions.
1367     for (unsigned Part = 0; Part < State.UF; ++Part) {
1368       // The pointer operand of the new GEP. If it's loop-invariant, we
1369       // won't broadcast it.
1370       auto *Ptr = isPointerLoopInvariant()
1371                       ? State.get(getOperand(0), VPIteration(0, 0))
1372                       : State.get(getOperand(0), Part);
1373 
1374       // Collect all the indices for the new GEP. If any index is
1375       // loop-invariant, we won't broadcast it.
1376       SmallVector<Value *, 4> Indices;
1377       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1378         VPValue *Operand = getOperand(I);
1379         if (isIndexLoopInvariant(I - 1))
1380           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1381         else
1382           Indices.push_back(State.get(Operand, Part));
1383       }
1384 
1385       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1386       // but it should be a vector otherwise.
1387       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1388                                              Indices, "", isInBounds());
1389       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1390              "NewGEP is not a pointer vector");
1391       State.set(this, NewGEP, Part);
1392       State.addMetadata(NewGEP, GEP);
1393     }
1394   }
1395 }
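// For illustration only: given a scalar GEP such as
//   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
// with loop-invariant %base, loop-varying %iv and VF=4 (names hypothetical),
// the loop-varying path above emits per part roughly
//   %widen.gep = getelementptr inbounds i32, ptr %base, <4 x i64> %vec.iv
// while an all-invariant GEP is instead cloned once as a scalar and broadcast
// to <4 x ptr> with CreateVectorSplat.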
1396 
1397 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1398 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1399                              VPSlotTracker &SlotTracker) const {
1400   O << Indent << "WIDEN-GEP ";
1401   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1402   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1403     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1404 
1405   O << " ";
1406   printAsOperand(O, SlotTracker);
1407   O << " = getelementptr";
1408   printFlags(O);
1409   printOperands(O, SlotTracker);
1410 }
1411 #endif
1412 
1413 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1414   auto &Builder = State.Builder;
1415   State.setDebugLocFrom(getDebugLoc());
1416   for (unsigned Part = 0; Part < State.UF; ++Part) {
1417     // Calculate the pointer for the specific unroll-part.
1418     Value *PartPtr = nullptr;
1419     // Use i32 for the GEP index type when the value is constant,
1420     // or query DataLayout for a more suitable index type otherwise.
1421     const DataLayout &DL =
1422         Builder.GetInsertBlock()->getModule()->getDataLayout();
1423     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1424                         ? DL.getIndexType(IndexedTy->getPointerTo())
1425                         : Builder.getInt32Ty();
1426     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1427     bool InBounds = isInBounds();
1428     if (IsReverse) {
1429       // If the address is consecutive but reversed, then the
1430       // wide store needs to start at the last vector element.
1431       // RunTimeVF = VScale * VF.getKnownMinValue()
1432       // For fixed-width VFs, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
1433       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1434       // NumElt = -Part * RunTimeVF
1435       Value *NumElt = Builder.CreateMul(
1436           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1437       // LastLane = 1 - RunTimeVF
1438       Value *LastLane =
1439           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1440       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1441       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1442     } else {
1443       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1444       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1445     }
1446 
1447     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1448   }
1449 }
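// For illustration only: with a fixed VF=4, i32 elements and inbounds set
// (a hypothetical configuration), part 1 of the forward case emits
//   %part.ptr = getelementptr inbounds i32, ptr %ptr, i32 4
// and the reverse case emits, with NumElt = -4 and LastLane = -3,
//   %tmp      = getelementptr inbounds i32, ptr %ptr, i32 -4
//   %part.ptr = getelementptr inbounds i32, ptr %tmp, i32 -3
// stepping the pointer backwards so the wide access covers the reversed lanes.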
1450 
1451 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1452 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1453                                   VPSlotTracker &SlotTracker) const {
1454   O << Indent;
1455   printAsOperand(O, SlotTracker);
1456   O << " = vector-pointer ";
1457   if (IsReverse)
1458     O << "(reverse) ";
1459 
1460   printOperands(O, SlotTracker);
1461 }
1462 #endif
1463 
1464 void VPBlendRecipe::execute(VPTransformState &State) {
1465   State.setDebugLocFrom(getDebugLoc());
1466   // We know that all PHIs in non-header blocks are converted into
1467   // selects, so we don't have to worry about the insertion order and we
1468   // can just use the builder.
1469   // At this point we generate the predication tree. There may be
1470   // duplications since this is a simple recursive scan, but future
1471   // optimizations will clean it up.
1472 
1473   unsigned NumIncoming = getNumIncomingValues();
1474 
1475   // Generate a sequence of selects of the form:
1476   // SELECT(Mask3, In3,
1477   //        SELECT(Mask2, In2,
1478   //               SELECT(Mask1, In1,
1479   //                      In0)))
1480   // Note that Mask0 is never used: lanes for which no path reaches this phi
1481   // are essentially undef and are taken from In0.
1482   VectorParts Entry(State.UF);
1483   for (unsigned In = 0; In < NumIncoming; ++In) {
1484     for (unsigned Part = 0; Part < State.UF; ++Part) {
1485       // We might have single-edge PHIs (blocks); use an identity
1486       // 'select' for the first PHI operand.
1487       Value *In0 = State.get(getIncomingValue(In), Part);
1488       if (In == 0)
1489         Entry[Part] = In0; // Initialize with the first incoming value.
1490       else {
1491         // Select between the current value and the previous incoming edge
1492         // based on the incoming mask.
1493         Value *Cond = State.get(getMask(In), Part);
1494         Entry[Part] =
1495             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1496       }
1497     }
1498   }
1499   for (unsigned Part = 0; Part < State.UF; ++Part)
1500     State.set(this, Entry[Part], Part);
1501 }
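// For illustration only: blending three incoming values at VF=4 (i32 values,
// hypothetical names) lowers per part to a select chain such as
//   %sel     = select <4 x i1> %mask.1, <4 x i32> %in.1, <4 x i32> %in.0
//   %predphi = select <4 x i1> %mask.2, <4 x i32> %in.2, <4 x i32> %sel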
1502 
1503 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1504 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1505                           VPSlotTracker &SlotTracker) const {
1506   O << Indent << "BLEND ";
1507   printAsOperand(O, SlotTracker);
1508   O << " =";
1509   if (getNumIncomingValues() == 1) {
1510     // Not a User of any mask: not really blending, this is a
1511     // single-predecessor phi.
1512     O << " ";
1513     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1514   } else {
1515     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1516       O << " ";
1517       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1518       if (I == 0)
1519         continue;
1520       O << "/";
1521       getMask(I)->printAsOperand(O, SlotTracker);
1522     }
1523   }
1524 }
1525 #endif
1526 
1527 void VPReductionRecipe::execute(VPTransformState &State) {
1528   assert(!State.Instance && "Reduction being replicated.");
1529   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1530   RecurKind Kind = RdxDesc.getRecurrenceKind();
1531   // Propagate the fast-math flags carried by the underlying instruction.
1532   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1533   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1534   for (unsigned Part = 0; Part < State.UF; ++Part) {
1535     Value *NewVecOp = State.get(getVecOp(), Part);
1536     if (VPValue *Cond = getCondOp()) {
1537       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1538       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1539       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1540       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
1541                                                   RdxDesc.getFastMathFlags());
1542       if (State.VF.isVector()) {
1543         Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
1544       }
1545 
1546       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
1547       NewVecOp = Select;
1548     }
1549     Value *NewRed;
1550     Value *NextInChain;
1551     if (IsOrdered) {
1552       if (State.VF.isVector())
1553         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1554                                         PrevInChain);
1555       else
1556         NewRed = State.Builder.CreateBinOp(
1557             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1558             NewVecOp);
1559       PrevInChain = NewRed;
1560     } else {
1561       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1562       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
1563     }
1564     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
1565       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1566                                    NewRed, PrevInChain);
1567     } else if (IsOrdered)
1568       NextInChain = NewRed;
1569     else
1570       NextInChain = State.Builder.CreateBinOp(
1571           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1572     State.set(this, NextInChain, Part, /*IsScalar*/ true);
1573   }
1574 }
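// For illustration only: an unordered integer add reduction at VF=4
// (hypothetical names) emits per part roughly
//   %rdx  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec.op)
//   %next = add i32 %rdx, %prev.in.chain
// For a conditional reduction, %vec.op is first masked with
//   select <4 x i1> %cond, <4 x i32> %vec.op, <4 x i32> zeroinitializer
// so that inactive lanes contribute the identity (0 for add).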
1575 
1576 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1577 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1578                               VPSlotTracker &SlotTracker) const {
1579   O << Indent << "REDUCE ";
1580   printAsOperand(O, SlotTracker);
1581   O << " = ";
1582   getChainOp()->printAsOperand(O, SlotTracker);
1583   O << " +";
1584   if (isa<FPMathOperator>(getUnderlyingInstr()))
1585     O << getUnderlyingInstr()->getFastMathFlags();
1586   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1587   getVecOp()->printAsOperand(O, SlotTracker);
1588   if (getCondOp()) {
1589     O << ", ";
1590     getCondOp()->printAsOperand(O, SlotTracker);
1591   }
1592   O << ")";
1593   if (RdxDesc.IntermediateStore)
1594     O << " (with final reduction value stored in invariant address sunk "
1595          "outside of loop)";
1596 }
1597 #endif
1598 
1599 bool VPReplicateRecipe::shouldPack() const {
1600   // Find if the recipe is used by a widened recipe via an intervening
1601   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1602   return any_of(users(), [](const VPUser *U) {
1603     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1604       return any_of(PredR->users(), [PredR](const VPUser *U) {
1605         return !U->usesScalars(PredR);
1606       });
1607     return false;
1608   });
1609 }
1610 
1611 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1612 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1613                               VPSlotTracker &SlotTracker) const {
1614   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1615 
1616   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1617     printAsOperand(O, SlotTracker);
1618     O << " = ";
1619   }
1620   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1621     O << "call";
1622     printFlags(O);
1623     O << "@" << CB->getCalledFunction()->getName() << "(";
1624     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1625                     O, [&O, &SlotTracker](VPValue *Op) {
1626                       Op->printAsOperand(O, SlotTracker);
1627                     });
1628     O << ")";
1629   } else {
1630     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1631     printFlags(O);
1632     printOperands(O, SlotTracker);
1633   }
1634 
1635   if (shouldPack())
1636     O << " (S->V)";
1637 }
1638 #endif
1639 
1640 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
1641 /// if it is either defined outside the vector region or its operand is known to
1642 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
1643 /// TODO: Uniformity should be associated with a VPValue and there should be a
1644 /// generic way to check.
1645 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
1646   return C->isDefinedOutsideVectorRegions() ||
1647          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
1648          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
1649 }
1650 
1651 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
1652   assert(vputils::onlyFirstLaneUsed(this) &&
1653          "Codegen only implemented for first lane.");
1654   switch (Opcode) {
1655   case Instruction::SExt:
1656   case Instruction::ZExt:
1657   case Instruction::Trunc: {
1658     // Note: SExt/ZExt not used yet.
1659     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
1660     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
1661   }
1662   default:
1663     llvm_unreachable("opcode not implemented yet");
1664   }
1665 }
1666 
1667 void VPScalarCastRecipe::execute(VPTransformState &State) {
1668   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
1669   for (unsigned Part = 0; Part != State.UF; ++Part) {
1670     Value *Res;
1671     // Only generate a single instance if the recipe is uniform across UFs
1672     // and VFs.
1673     if (Part > 0 && IsUniformAcrossVFsAndUFs)
1674       Res = State.get(this, VPIteration(0, 0));
1675     else
1676       Res = generate(State, Part);
1677     State.set(this, Res, VPIteration(Part, 0));
1678   }
1679 }
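// For illustration only: truncating a canonical i64 IV %index to i32 emits
//   %cast = trunc i64 %index to i32
// for part 0; if the recipe is uniform across VFs and UFs, parts 1..UF-1
// reuse that value instead of emitting further casts.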
1680 
1681 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1682 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
1683                                VPSlotTracker &SlotTracker) const {
1684   O << Indent << "SCALAR-CAST ";
1685   printAsOperand(O, SlotTracker);
1686   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1687   printOperands(O, SlotTracker);
1688   O << " to " << *ResultTy;
1689 }
1690 #endif
1691 
1692 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1693   assert(State.Instance && "Branch on Mask works only on a single instance.");
1694 
1695   unsigned Part = State.Instance->Part;
1696   unsigned Lane = State.Instance->Lane.getKnownLane();
1697 
1698   Value *ConditionBit = nullptr;
1699   VPValue *BlockInMask = getMask();
1700   if (BlockInMask) {
1701     ConditionBit = State.get(BlockInMask, Part);
1702     if (ConditionBit->getType()->isVectorTy())
1703       ConditionBit = State.Builder.CreateExtractElement(
1704           ConditionBit, State.Builder.getInt32(Lane));
1705   } else // Block in mask is all-one.
1706     ConditionBit = State.Builder.getTrue();
1707 
1708   // Replace the temporary unreachable terminator with a new conditional branch,
1709   // whose two destinations will be set later when they are created.
1710   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1711   assert(isa<UnreachableInst>(CurrentTerminator) &&
1712          "Expected to replace unreachable terminator with conditional branch.");
1713   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1714   CondBr->setSuccessor(0, nullptr);
1715   ReplaceInstWithInst(CurrentTerminator, CondBr);
1716 }
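// For illustration only: for an instance with a widened mask this emits
// roughly (lane index hypothetical)
//   %c = extractelement <4 x i1> %mask, i32 2
//   br i1 %c, ...
// where both branch destinations are still unset and are patched in once the
// predicated block and its continuation have been created.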
1717 
1718 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1719   assert(State.Instance && "Predicated instruction PHI works per instance.");
1720   Instruction *ScalarPredInst =
1721       cast<Instruction>(State.get(getOperand(0), *State.Instance));
1722   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1723   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1724   assert(PredicatingBB && "Predicated block has no single predecessor.");
1725   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1726          "operand must be VPReplicateRecipe");
1727 
1728   // By current pack/unpack logic we need to generate only a single phi node: if
1729   // a vector value for the predicated instruction exists at this point it means
1730   // the instruction has vector users only, and a phi for the vector value is
1731   // needed. In this case the recipe of the predicated instruction is marked to
1732   // also do that packing, thereby "hoisting" the insert-element sequence.
1733   // Otherwise, a phi node for the scalar value is needed.
1734   unsigned Part = State.Instance->Part;
1735   if (State.hasVectorValue(getOperand(0), Part)) {
1736     Value *VectorValue = State.get(getOperand(0), Part);
1737     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1738     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1739     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1740     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1741     if (State.hasVectorValue(this, Part))
1742       State.reset(this, VPhi, Part);
1743     else
1744       State.set(this, VPhi, Part);
1745     // NOTE: Currently we need to update the value of the operand, so the next
1746     // predicated iteration inserts its generated value in the correct vector.
1747     State.reset(getOperand(0), VPhi, Part);
1748   } else {
1749     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1750     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1751     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1752                      PredicatingBB);
1753     Phi->addIncoming(ScalarPredInst, PredicatedBB);
1754     if (State.hasScalarValue(this, *State.Instance))
1755       State.reset(this, Phi, *State.Instance);
1756     else
1757       State.set(this, Phi, *State.Instance);
1758     // NOTE: Currently we need to update the value of the operand, so the next
1759     // predicated iteration inserts its generated value in the correct vector.
1760     State.reset(getOperand(0), Phi, *State.Instance);
1761   }
1762 }
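// For illustration only (hypothetical names): the scalar case emits
//   %phi = phi i32 [ poison, %predicating.bb ], [ %pred.inst, %predicated.bb ]
// and the packed case emits
//   %vphi = phi <4 x i32> [ %vec, %predicating.bb ], [ %vec.ins, %predicated.bb ]
// where %vec.ins is the insertelement that packs the predicated result into
// the vector value.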
1763 
1764 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1765 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1766                                 VPSlotTracker &SlotTracker) const {
1767   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1768   printAsOperand(O, SlotTracker);
1769   O << " = ";
1770   printOperands(O, SlotTracker);
1771 }
1772 
1773 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
1774                               VPSlotTracker &SlotTracker) const {
1775   O << Indent << "WIDEN ";
1776   printAsOperand(O, SlotTracker);
1777   O << " = load ";
1778   printOperands(O, SlotTracker);
1779 }
1780 
1781 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
1782                                VPSlotTracker &SlotTracker) const {
1783   O << Indent << "WIDEN store ";
1784   printOperands(O, SlotTracker);
1785 }
1786 #endif
1787 
1788 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1789   Value *Start = getStartValue()->getLiveInIRValue();
1790   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1791   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1792 
1793   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1794   EntryPart->addIncoming(Start, VectorPH);
1795   EntryPart->setDebugLoc(getDebugLoc());
1796   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1797     State.set(this, EntryPart, Part, /*IsScalar*/ true);
1798 }
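// For illustration only: this places the scalar induction phi in the vector
// loop header, e.g. for an i64 start value
//   %index = phi i64 [ %start, %vector.ph ], [ ... ]
// Only the preheader incoming value is added here; the backedge value is
// hooked up later, once the increment of the canonical IV exists.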
1799 
1800 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1801 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1802                                    VPSlotTracker &SlotTracker) const {
1803   O << Indent << "EMIT ";
1804   printAsOperand(O, SlotTracker);
1805   O << " = CANONICAL-INDUCTION ";
1806   printOperands(O, SlotTracker);
1807 }
1808 #endif
1809 
1810 bool VPCanonicalIVPHIRecipe::isCanonical(
1811     InductionDescriptor::InductionKind Kind, VPValue *Start,
1812     VPValue *Step) const {
1813   // Must be an integer induction.
1814   if (Kind != InductionDescriptor::IK_IntInduction)
1815     return false;
1816   // Start must match the start value of this canonical induction.
1817   if (Start != getStartValue())
1818     return false;
1819 
1820   // If the step is defined by a recipe, it is not a ConstantInt.
1821   if (Step->getDefiningRecipe())
1822     return false;
1823 
1824   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1825   return StepC && StepC->isOne();
1826 }
1827 
1828 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
1829   return IsScalarAfterVectorization &&
1830          (!IsScalable || vputils::onlyFirstLaneUsed(this));
1831 }
1832 
1833 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1834 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1835                                           VPSlotTracker &SlotTracker) const {
1836   O << Indent << "EMIT ";
1837   printAsOperand(O, SlotTracker);
1838   O << " = WIDEN-POINTER-INDUCTION ";
1839   getStartValue()->printAsOperand(O, SlotTracker);
1840   O << ", " << *IndDesc.getStep();
1841 }
1842 #endif
1843 
1844 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1845   assert(!State.Instance && "cannot be used per-lane");
1846   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1847   SCEVExpander Exp(SE, DL, "induction");
1848 
1849   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1850                                  &*State.Builder.GetInsertPoint());
1851   assert(!State.ExpandedSCEVs.contains(Expr) &&
1852          "Same SCEV expanded multiple times");
1853   State.ExpandedSCEVs[Expr] = Res;
1854   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1855     State.set(this, Res, {Part, 0});
1856 }
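// For illustration only: expanding a SCEV such as (zext i32 %n to i64)
// materializes the corresponding IR, e.g.
//   %0 = zext i32 %n to i64
// at the current insert point, and every part maps to that single value.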
1857 
1858 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1859 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1860                                VPSlotTracker &SlotTracker) const {
1861   O << Indent << "EMIT ";
1862   getVPSingleValue()->printAsOperand(O, SlotTracker);
1863   O << " = EXPAND SCEV " << *Expr;
1864 }
1865 #endif
1866 
1867 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1868   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
1869   Type *STy = CanonicalIV->getType();
1870   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1871   ElementCount VF = State.VF;
1872   Value *VStart = VF.isScalar()
1873                       ? CanonicalIV
1874                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1875   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1876     Value *VStep = createStepForVF(Builder, STy, VF, Part);
1877     if (VF.isVector()) {
1878       VStep = Builder.CreateVectorSplat(VF, VStep);
1879       VStep =
1880           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1881     }
1882     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1883     State.set(this, CanonicalVectorIV, Part);
1884   }
1885 }
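// For illustration only: with a fixed VF=4, UF=2 and an i64 canonical IV
// %index (hypothetical values), this emits roughly
//   %bc      = ... broadcast of %index to <4 x i64> ...
//   %vec.iv  = add <4 x i64> %bc, <i64 0, i64 1, i64 2, i64 3>
//   %vec.iv1 = add <4 x i64> %bc, <i64 4, i64 5, i64 6, i64 7>
// i.e. each part is the broadcast IV plus a constant step-vector offset.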
1886 
1887 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1888 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1889                                      VPSlotTracker &SlotTracker) const {
1890   O << Indent << "EMIT ";
1891   printAsOperand(O, SlotTracker);
1892   O << " = WIDEN-CANONICAL-INDUCTION ";
1893   printOperands(O, SlotTracker);
1894 }
1895 #endif
1896 
1897 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1898   auto &Builder = State.Builder;
1899   // Create a vector from the initial value.
1900   auto *VectorInit = getStartValue()->getLiveInIRValue();
1901 
1902   Type *VecTy = State.VF.isScalar()
1903                     ? VectorInit->getType()
1904                     : VectorType::get(VectorInit->getType(), State.VF);
1905 
1906   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1907   if (State.VF.isVector()) {
1908     auto *IdxTy = Builder.getInt32Ty();
1909     auto *One = ConstantInt::get(IdxTy, 1);
1910     IRBuilder<>::InsertPointGuard Guard(Builder);
1911     Builder.SetInsertPoint(VectorPH->getTerminator());
1912     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1913     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
1914     VectorInit = Builder.CreateInsertElement(
1915         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
1916   }
1917 
1918   // Create a phi node for the new recurrence.
1919   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
1920   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1921   EntryPart->addIncoming(VectorInit, VectorPH);
1922   State.set(this, EntryPart, 0);
1923 }
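// For illustration only: for a fixed VF=4 and an i32 recurrence starting at
// %pre.load (hypothetical name), the preheader receives
//   %vector.recur.init = insertelement <4 x i32> poison, i32 %pre.load, i32 3
// and the vector loop header receives
//   %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ ... ]
// with the backedge value added later, when the recurrence itself is widened.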
1924 
1925 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1926 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
1927                                             VPSlotTracker &SlotTracker) const {
1928   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
1929   printAsOperand(O, SlotTracker);
1930   O << " = phi ";
1931   printOperands(O, SlotTracker);
1932 }
1933 #endif
1934 
1935 void VPReductionPHIRecipe::execute(VPTransformState &State) {
1936   auto &Builder = State.Builder;
1937 
1938   // Reductions do not have to start at zero. They can start with
1939   // any loop-invariant value.
1940   VPValue *StartVPV = getStartValue();
1941   Value *StartV = StartVPV->getLiveInIRValue();
1942 
1943   // In order to support recurrences we need to be able to vectorize Phi nodes.
1944   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
1945   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
1946   // this value when we vectorize all of the instructions that use the PHI.
1947   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
1948   Type *VecTy = ScalarPHI ? StartV->getType()
1949                           : VectorType::get(StartV->getType(), State.VF);
1950 
1951   BasicBlock *HeaderBB = State.CFG.PrevBB;
1952   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
1953          "recipe must be in the vector loop header");
1954   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
1955   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1956     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
1957     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
1958     State.set(this, EntryPart, Part, IsInLoop);
1959   }
1960 
1961   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1962 
1963   Value *Iden = nullptr;
1964   RecurKind RK = RdxDesc.getRecurrenceKind();
1965   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
1966       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
1967     // MinMax and AnyOf reductions have the start value as their identity.
1968     if (ScalarPHI) {
1969       Iden = StartV;
1970     } else {
1971       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1972       Builder.SetInsertPoint(VectorPH->getTerminator());
1973       StartV = Iden =
1974           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
1975     }
1976   } else {
1977     Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
1978                                          RdxDesc.getFastMathFlags());
1979 
1980     if (!ScalarPHI) {
1981       Iden = Builder.CreateVectorSplat(State.VF, Iden);
1982       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1983       Builder.SetInsertPoint(VectorPH->getTerminator());
1984       Constant *Zero = Builder.getInt32(0);
1985       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
1986     }
1987   }
1988 
1989   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1990     Value *EntryPart = State.get(this, Part, IsInLoop);
1991     // Make sure to add the reduction start value only to the
1992     // first unroll part.
1993     Value *StartVal = (Part == 0) ? StartV : Iden;
1994     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
1995   }
1996 }
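// For illustration only: an unordered i32 add reduction with VF=4, UF=2 and
// start value %s (hypothetical) computes in the preheader
//   %start.vec = insertelement <4 x i32> zeroinitializer, i32 %s, i32 0
// and creates one header phi per part
//   %vec.phi  = phi <4 x i32> [ %start.vec, %vector.ph ], [ ... ]
//   %vec.phi1 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ ... ]
// so the start value feeds only part 0 and the other parts start at the
// identity (0 for add).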
1997 
1998 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1999 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2000                                  VPSlotTracker &SlotTracker) const {
2001   O << Indent << "WIDEN-REDUCTION-PHI ";
2002 
2003   printAsOperand(O, SlotTracker);
2004   O << " = phi ";
2005   printOperands(O, SlotTracker);
2006 }
2007 #endif
2008 
2009 void VPWidenPHIRecipe::execute(VPTransformState &State) {
2010   assert(EnableVPlanNativePath &&
2011          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
2012 
2013   Value *Op0 = State.get(getOperand(0), 0);
2014   Type *VecTy = Op0->getType();
2015   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
2016   State.set(this, VecPhi, 0);
2017 }
2018 
2019 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2020 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2021                              VPSlotTracker &SlotTracker) const {
2022   O << Indent << "WIDEN-PHI ";
2023 
2024   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
2025   // Unless all incoming values are modeled in VPlan, print the original PHI
2026   // directly.
2027   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
2028   // values as VPValues.
2029   if (getNumOperands() != OriginalPhi->getNumOperands()) {
2030     O << VPlanIngredient(OriginalPhi);
2031     return;
2032   }
2033 
2034   printAsOperand(O, SlotTracker);
2035   O << " = phi ";
2036   printOperands(O, SlotTracker);
2037 }
2038 #endif
2039 
2040 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2041 // remove VPActiveLaneMaskPHIRecipe.
2042 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
2043   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2044   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2045     Value *StartMask = State.get(getOperand(0), Part);
2046     PHINode *EntryPart =
2047         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
2048     EntryPart->addIncoming(StartMask, VectorPH);
2049     EntryPart->setDebugLoc(getDebugLoc());
2050     State.set(this, EntryPart, Part);
2051   }
2052 }
2053 
2054 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2055 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2056                                       VPSlotTracker &SlotTracker) const {
2057   O << Indent << "ACTIVE-LANE-MASK-PHI ";
2058 
2059   printAsOperand(O, SlotTracker);
2060   O << " = phi ";
2061   printOperands(O, SlotTracker);
2062 }
2063 #endif
2064 
2065 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
2066   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2067   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
2068   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
2069   PHINode *EntryPart =
2070       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
2071   EntryPart->addIncoming(Start, VectorPH);
2072   EntryPart->setDebugLoc(getDebugLoc());
2073   State.set(this, EntryPart, 0, /*IsScalar=*/true);
2074 }
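// For illustration only: for an i64 start value this creates
//   %evl.based.iv = phi i64 [ %start, %vector.ph ], [ ... ]
// with only the preheader operand in place; the backedge increment by the
// explicit vector length computed each iteration is hooked up later.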
2075 
2076 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2077 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2078                                   VPSlotTracker &SlotTracker) const {
2079   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
2080 
2081   printAsOperand(O, SlotTracker);
2082   O << " = phi ";
2083   printOperands(O, SlotTracker);
2084 }
2085 #endif
2086