xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (revision 647cbc5de815c5651677bf8582797f716ec7b48d)
1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/Analysis/IVDescriptors.h"
20 #include "llvm/IR/BasicBlock.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/Type.h"
25 #include "llvm/IR/Value.h"
26 #include "llvm/Support/Casting.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/Debug.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
31 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
32 #include <cassert>
33 
34 using namespace llvm;
35 
36 using VectorParts = SmallVector<Value *, 2>;
37 
38 namespace llvm {
39 extern cl::opt<bool> EnableVPlanNativePath;
40 }
41 
42 #define LV_NAME "loop-vectorize"
43 #define DEBUG_TYPE LV_NAME
44 
// Returns true if this recipe may write to memory, determined per recipe kind;
// conservatively returns true for unknown kinds.
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    // An interleave group writes iff it contains at least one store member.
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenMemoryInstructionSC: {
    // Widened stores write; widened loads do not.
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    // These recipes wrap an IR instruction; defer to its conservative answer.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These recipe kinds never write memory; in asserts builds, double-check
    // that claim against the underlying IR instruction, if one exists.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    // Conservatively assume any other recipe may write to memory.
    return true;
  }
}
80 
// Returns true if this recipe may read from memory, determined per recipe
// kind; conservatively returns true for unknown kinds.
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    // Widened loads read; widened stores do not.
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    // These recipes wrap an IR instruction; defer to its conservative answer.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These recipe kinds never read memory; in asserts builds, double-check
    // that claim against the underlying IR instruction, if one exists.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // Conservatively assume any other recipe may read from memory.
    return true;
  }
}
114 
// Returns true if this recipe may have side effects (beyond reading memory),
// determined per recipe kind; conservatively true for unknown kinds.
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPInstructionSC:
    // Only a small set of VPInstruction opcodes is known side-effect free.
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::ICmp:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    // Defer to the wrapped call instruction.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These recipe kinds are side-effect free; in asserts builds, double-check
    // against the underlying IR instruction, if one exists.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    // An interleave group has side effects iff it writes memory.
    return mayWriteToMemory();
  case VPWidenMemoryInstructionSC:
    // The answer must agree with the ingredient's own notion of side effects.
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
                   ->getIngredient()
                   .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    // Defer to the replicated scalar instruction.
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}
168 
169 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
170   auto Lane = VPLane::getLastLaneForVF(State.VF);
171   VPValue *ExitValue = getOperand(0);
172   if (vputils::isUniformAfterVectorization(ExitValue))
173     Lane = VPLane::getFirstLane();
174   VPBasicBlock *MiddleVPBB =
175       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
176   assert(MiddleVPBB->getNumSuccessors() == 0 &&
177          "the middle block must not have any successors");
178   BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
179   Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
180                    MiddleBB);
181 }
182 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this live-out as "Live-out <phi> = <exit value>".
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif
192 
193 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
194   assert(!Parent && "Recipe already in some VPBasicBlock");
195   assert(InsertPos->getParent() &&
196          "Insertion position not in any VPBasicBlock");
197   Parent = InsertPos->getParent();
198   Parent->getRecipeList().insert(InsertPos->getIterator(), this);
199 }
200 
201 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
202                                 iplist<VPRecipeBase>::iterator I) {
203   assert(!Parent && "Recipe already in some VPBasicBlock");
204   assert(I == BB.end() || I->getParent() == &BB);
205   Parent = &BB;
206   BB.getRecipeList().insert(I, this);
207 }
208 
209 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
210   assert(!Parent && "Recipe already in some VPBasicBlock");
211   assert(InsertPos->getParent() &&
212          "Insertion position not in any VPBasicBlock");
213   Parent = InsertPos->getParent();
214   Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
215 }
216 
// Unlink this recipe from its block without deleting it; the caller takes
// ownership and typically re-inserts it elsewhere.
void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}
222 
// Unlink and delete this recipe; returns an iterator to the next recipe in
// the block so callers can continue iterating safely.
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}
227 
// Re-home this recipe directly after InsertPos, possibly in a different block.
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
232 
// Re-home this recipe into BB at position I, possibly from a different block.
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
238 
239 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
240   assert(OpType == OperationType::FPMathOp &&
241          "recipe doesn't have fast math flags");
242   FastMathFlags Res;
243   Res.setAllowReassoc(FMFs.AllowReassoc);
244   Res.setNoNaNs(FMFs.NoNaNs);
245   Res.setNoInfs(FMFs.NoInfs);
246   Res.setNoSignedZeros(FMFs.NoSignedZeros);
247   Res.setAllowReciprocal(FMFs.AllowReciprocal);
248   Res.setAllowContract(FMFs.AllowContract);
249   Res.setApproxFunc(FMFs.ApproxFunc);
250   return Res;
251 }
252 
// Construct a two-operand VPInstruction carrying a compare predicate.
// Currently only integer compares (ICmp) may carry an explicit predicate.
VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      VPValue(this), Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}
262 
// Construct a VPInstruction carrying fast-math flags; only valid for opcodes
// that are floating-point operations (see isFPMathOp).
VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      VPValue(this), Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}
271 
// Generate the IR for this VPInstruction for unroll part \p Part and return
// the resulting Value (or nullptr for branch opcodes on parts > 0, which
// produce nothing). Opcodes are either plain LLVM opcodes, widened generically
// below, or VPlan-specific opcodes handled in the switch.
Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  // Generic path for LLVM binary operators.
  if (Instruction::isBinaryOp(getOpcode())) {
    // If only the first part is used, reuse part 0's value for later parts.
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0);

    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    // Transfer nuw/nsw/exact/FMF etc. from the recipe onto the new IR.
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // Emit llvm.get.active.lane.mask producing a <VF x i1> predicate.
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    // Scalar (non-vector) values need no splice; use the previous value as-is.
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    // Compute max(TC - VF * UF, 0) without underflowing: select on TC > step.
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    // Part 0 uses the canonical IV unchanged.
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    // Branches are emitted once, for part 0 only.
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // Branches are emitted once, for part 0 only.
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
408 
409 #if !defined(NDEBUG)
410 bool VPInstruction::isFPMathOp() const {
411   // Inspired by FPMathOperator::classof. Notable differences are that we don't
412   // support Call, PHI and Select opcodes here yet.
413   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
414          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
415          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
416          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
417 }
418 #endif
419 
420 void VPInstruction::execute(VPTransformState &State) {
421   assert(!State.Instance && "VPInstruction executing an Instance");
422   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
423   assert((hasFastMathFlags() == isFPMathOp() ||
424           getOpcode() == Instruction::Select) &&
425          "Recipe not a FPMathOp but has fast-math flags?");
426   if (hasFastMathFlags())
427     State.Builder.setFastMathFlags(getFastMathFlags());
428   for (unsigned Part = 0; Part < State.UF; ++Part) {
429     Value *GeneratedValue = generateInstruction(State, Part);
430     if (!hasResult())
431       continue;
432     assert(GeneratedValue && "generateInstruction must produce a value");
433     State.set(this, GeneratedValue, Part);
434   }
435 }
436 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug helper: print this VPInstruction to dbgs() using plan-local names.
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  // Only opcodes that define a value get a "<res> = " prefix.
  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  // VPlan-specific opcodes get symbolic names; plain LLVM opcodes use their
  // standard textual spelling via getOpcodeName.
  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  // Append the debug location, if any, in IR-like "!dbg" form.
  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif
493 
494 void VPWidenCallRecipe::execute(VPTransformState &State) {
495   assert(State.VF.isVector() && "not widening");
496   auto &CI = *cast<CallInst>(getUnderlyingInstr());
497   assert(!isa<DbgInfoIntrinsic>(CI) &&
498          "DbgInfoIntrinsic should have been dropped during VPlan construction");
499   State.setDebugLocFrom(CI.getDebugLoc());
500 
501   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
502   FunctionType *VFTy = nullptr;
503   if (Variant)
504     VFTy = Variant->getFunctionType();
505   for (unsigned Part = 0; Part < State.UF; ++Part) {
506     SmallVector<Type *, 2> TysForDecl;
507     // Add return type if intrinsic is overloaded on it.
508     if (UseIntrinsic &&
509         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
510       TysForDecl.push_back(
511           VectorType::get(CI.getType()->getScalarType(), State.VF));
512     SmallVector<Value *, 4> Args;
513     for (const auto &I : enumerate(operands())) {
514       // Some intrinsics have a scalar argument - don't replace it with a
515       // vector.
516       // Some vectorized function variants may also take a scalar argument,
517       // e.g. linear parameters for pointers.
518       Value *Arg;
519       if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
520           (UseIntrinsic &&
521            isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
522         Arg = State.get(I.value(), VPIteration(0, 0));
523       else
524         Arg = State.get(I.value(), Part);
525       if (UseIntrinsic &&
526           isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
527         TysForDecl.push_back(Arg->getType());
528       Args.push_back(Arg);
529     }
530 
531     Function *VectorF;
532     if (UseIntrinsic) {
533       // Use vector version of the intrinsic.
534       Module *M = State.Builder.GetInsertBlock()->getModule();
535       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
536       assert(VectorF && "Can't retrieve vector intrinsic.");
537     } else {
538 #ifndef NDEBUG
539       assert(Variant != nullptr && "Can't create vector function.");
540 #endif
541       VectorF = Variant;
542     }
543 
544     SmallVector<OperandBundleDef, 1> OpBundles;
545     CI.getOperandBundlesAsDefs(OpBundles);
546     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
547 
548     if (isa<FPMathOperator>(V))
549       V->copyFastMathFlags(&CI);
550 
551     State.set(this, V, Part);
552     State.addMetadata(V, &CI);
553   }
554 }
555 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print a widened call, e.g. "WIDEN-CALL vp<%x> = call @f(...) (using vector
// intrinsic)".
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  // Void calls define no VPValue, so there is nothing to print on the LHS.
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  // Note how the call will be widened: via intrinsic or library variant.
  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

// Print a widened select, e.g. "WIDEN-SELECT vp<%x> = select c, a, b".
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif
596 
597 void VPWidenSelectRecipe::execute(VPTransformState &State) {
598   State.setDebugLocFrom(getDebugLoc());
599 
600   // The condition can be loop invariant but still defined inside the
601   // loop. This means that we can't just use the original 'cond' value.
602   // We have to take the 'vectorized' value and pick the first lane.
603   // Instcombine will make this a no-op.
604   auto *InvarCond =
605       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
606 
607   for (unsigned Part = 0; Part < State.UF; ++Part) {
608     Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
609     Value *Op0 = State.get(getOperand(1), Part);
610     Value *Op1 = State.get(getOperand(2), Part);
611     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
612     State.set(this, Sel, Part);
613     State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
614   }
615 }
616 
// Capture an llvm::FastMathFlags into the recipe's compact bitfield form;
// getFastMathFlags() performs the inverse conversion.
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}
627 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the IR flags carried by this recipe (predicate, disjoint/exact,
// wrap flags, FMF, inbounds, nneg), using LLVM IR's textual spellings.
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  // Separate the flags from the operand list that follows.
  if (getNumOperands() > 0)
    O << " ";
}
#endif
666 
// Widen a single scalar instruction into a vector instruction per unroll
// part. Handles unary/binary operators, freeze and compares; opcodes with
// dedicated recipes (call, branch, phi, GEP, select) must not reach here.
void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      // Transfer nuw/nsw/exact/FMF etc. from the recipe onto the new IR.
      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}
763 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print a widened op, e.g. "WIDEN vp<%x> = add nsw ir<%a>, ir<%b>".
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
774 
775 void VPWidenCastRecipe::execute(VPTransformState &State) {
776   State.setDebugLocFrom(getDebugLoc());
777   auto &Builder = State.Builder;
778   /// Vectorize casts.
779   assert(State.VF.isVector() && "Not vectorizing?");
780   Type *DestTy = VectorType::get(getResultType(), State.VF);
781   VPValue *Op = getOperand(0);
782   for (unsigned Part = 0; Part < State.UF; ++Part) {
783     if (Part > 0 && Op->isLiveIn()) {
784       // FIXME: Remove once explicit unrolling is implemented using VPlan.
785       State.set(this, State.get(this, 0), Part);
786       continue;
787     }
788     Value *A = State.get(Op, Part);
789     Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
790     State.set(this, Cast, Part);
791     State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
792   }
793 }
794 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print a widened cast, e.g. "WIDEN-CAST vp<%x> = zext ir<%y> to i64".
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif
806 
807 /// This function adds
808 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
809 /// to each vector element of Val. The sequence starts at StartIndex.
810 /// \p Opcode is relevant for FP induction variable.
811 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
812                             Instruction::BinaryOps BinOp, ElementCount VF,
813                             IRBuilderBase &Builder) {
814   assert(VF.isVector() && "only vector VFs are supported");
815 
816   // Create and check the types.
817   auto *ValVTy = cast<VectorType>(Val->getType());
818   ElementCount VLen = ValVTy->getElementCount();
819 
820   Type *STy = Val->getType()->getScalarType();
821   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
822          "Induction Step must be an integer or FP");
823   assert(Step->getType() == STy && "Step has wrong type");
824 
825   SmallVector<Constant *, 8> Indices;
826 
827   // Create a vector of consecutive numbers from zero to VF.
828   VectorType *InitVecValVTy = ValVTy;
829   if (STy->isFloatingPointTy()) {
830     Type *InitVecValSTy =
831         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
832     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
833   }
834   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
835 
836   // Splat the StartIdx
837   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
838 
839   if (STy->isIntegerTy()) {
840     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
841     Step = Builder.CreateVectorSplat(VLen, Step);
842     assert(Step->getType() == Val->getType() && "Invalid step vec");
843     // FIXME: The newly created binary instructions should contain nsw/nuw
844     // flags, which can be found from the original scalar operations.
845     Step = Builder.CreateMul(InitVec, Step);
846     return Builder.CreateAdd(Val, Step, "induction");
847   }
848 
849   // Floating point induction.
850   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
851          "Binary Opcode should be specified for FP induction");
852   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
853   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
854 
855   Step = Builder.CreateVectorSplat(VLen, Step);
856   Value *MulOp = Builder.CreateFMul(InitVec, Step);
857   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
858 }
859 
860 /// A helper function that returns an integer or floating-point constant with
861 /// value C.
862 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
863   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
864                            : ConstantFP::get(Ty, C);
865 }
866 
867 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
868                                   ElementCount VF) {
869   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
870   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
871   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
872   return B.CreateUIToFP(RuntimeVF, FTy);
873 }
874 
// Expand a widened int/FP induction: build the initial step vector in the
// vector preheader, create a vector phi in the header, and emit one
// "step.add" per unroll part, each advancing by VF * Step.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // The induction is consumed through a truncate; narrow both the start
    // value and the step to the truncated type.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  // Initial value: splat(Start) + <0, 1, ..., VF-1> * Step.
  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    // Each part is the previous part advanced by VF * Step.
    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}
975 
976 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
977 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
978                                           VPSlotTracker &SlotTracker) const {
979   O << Indent << "WIDEN-INDUCTION";
980   if (getTruncInst()) {
981     O << "\\l\"";
982     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
983     O << " +\n" << Indent << "\"  ";
984     getVPValue(0)->printAsOperand(O, SlotTracker);
985   } else
986     O << " " << VPlanIngredient(IV);
987 
988   O << ", ";
989   getStepValue()->printAsOperand(O, SlotTracker);
990 }
991 #endif
992 
993 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
994   // The step may be defined by a recipe in the preheader (e.g. if it requires
995   // SCEV expansion), but for the canonical induction the step is required to be
996   // 1, which is represented as live-in.
997   if (getStepValue()->getDefiningRecipe())
998     return false;
999   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1000   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1001   return StartC && StartC->isZero() && StepC && StepC->isOne();
1002 }
1003 
1004 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1005 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1006                               VPSlotTracker &SlotTracker) const {
1007   O << Indent;
1008   printAsOperand(O, SlotTracker);
1009   O << Indent << "= DERIVED-IV ";
1010   getStartValue()->printAsOperand(O, SlotTracker);
1011   O << " + ";
1012   getCanonicalIV()->printAsOperand(O, SlotTracker);
1013   O << " * ";
1014   getStepValue()->printAsOperand(O, SlotTracker);
1015 
1016   if (TruncResultTy)
1017     O << " (truncated to " << *TruncResultTy << ")";
1018 }
1019 #endif
1020 
// Expand scalar induction steps: for each demanded (part, lane), compute
// BaseIV + (Part * VF + Lane) * Step and record it in State. For scalable
// VFs whose lanes are all used, a per-part vector of lane steps is also
// materialized.
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps: operand 0 is the scalar induction
  // variable on which to base the steps, getStepValue() is the step size.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  if (BaseIVTy != Step->getType()) {
    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
    // avoid separate truncate here.
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
  }

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    // Scalable lanes cannot be enumerated at compile time; precompute splats
    // and a unit step vector so all lanes can be formed at once per part.
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  // When replicating a single instance, restrict generation to exactly that
  // (part, lane) pair.
  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    // StartIdx0 = Part * VF (runtime value for scalable VFs).
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}
1115 
1116 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print "<result> = SCALAR-STEPS <operands>".
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
1124 #endif
1125 
// Widen a GEP: if all operands are loop-invariant, emit one scalar GEP and
// splat it per part; otherwise emit a (possibly vector) GEP per unroll part,
// using vector-typed operands only for loop-varying values.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
1197 
1198 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1199 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1200                              VPSlotTracker &SlotTracker) const {
1201   O << Indent << "WIDEN-GEP ";
1202   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1203   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1204     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1205 
1206   O << " ";
1207   printAsOperand(O, SlotTracker);
1208   O << " = getelementptr";
1209   printFlags(O);
1210   printOperands(O, SlotTracker);
1211 }
1212 #endif
1213 
// Compute the per-part pointer for a consecutive (possibly reversed) wide
// memory access, offsetting the scalar base pointer by whole vectors.
void VPVectorPointerRecipe ::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getModule()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = false;
    // Carry over 'inbounds' only when the original address computation had it.
    if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = GEP->isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF =  VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      // Forward access: advance by Part whole vectors.
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part);
  }
}
1253 
1254 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print "<result> = vector-pointer [(reverse)] <operands>".
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
1265 #endif
1266 
1267 void VPBlendRecipe::execute(VPTransformState &State) {
1268   State.setDebugLocFrom(getDebugLoc());
1269   // We know that all PHIs in non-header blocks are converted into
1270   // selects, so we don't have to worry about the insertion order and we
1271   // can just use the builder.
1272   // At this point we generate the predication tree. There may be
1273   // duplications since this is a simple recursive scan, but future
1274   // optimizations will clean it up.
1275 
1276   unsigned NumIncoming = getNumIncomingValues();
1277 
1278   // Generate a sequence of selects of the form:
1279   // SELECT(Mask3, In3,
1280   //        SELECT(Mask2, In2,
1281   //               SELECT(Mask1, In1,
1282   //                      In0)))
1283   // Note that Mask0 is never used: lanes for which no path reaches this phi and
1284   // are essentially undef are taken from In0.
1285  VectorParts Entry(State.UF);
1286   for (unsigned In = 0; In < NumIncoming; ++In) {
1287     for (unsigned Part = 0; Part < State.UF; ++Part) {
1288       // We might have single edge PHIs (blocks) - use an identity
1289       // 'select' for the first PHI operand.
1290       Value *In0 = State.get(getIncomingValue(In), Part);
1291       if (In == 0)
1292         Entry[Part] = In0; // Initialize with the first incoming value.
1293       else {
1294         // Select between the current value and the previous incoming edge
1295         // based on the incoming mask.
1296         Value *Cond = State.get(getMask(In), Part);
1297         Entry[Part] =
1298             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1299       }
1300     }
1301   }
1302   for (unsigned Part = 0; Part < State.UF; ++Part)
1303     State.set(this, Entry[Part], Part);
1304 }
1305 
1306 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1307 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1308                           VPSlotTracker &SlotTracker) const {
1309   O << Indent << "BLEND ";
1310   printAsOperand(O, SlotTracker);
1311   O << " =";
1312   if (getNumIncomingValues() == 1) {
1313     // Not a User of any mask: not really blending, this is a
1314     // single-predecessor phi.
1315     O << " ";
1316     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1317   } else {
1318     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1319       O << " ";
1320       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1321       O << "/";
1322       getMask(I)->printAsOperand(O, SlotTracker);
1323     }
1324   }
1325 }
1326 
// Print "REDUCE <res> = <chain> +[fmf] reduce.<op> (<vec>[, <cond>])",
// noting when the final value is stored to an invariant address outside the
// loop.
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
1347 #endif
1348 
1349 bool VPReplicateRecipe::shouldPack() const {
1350   // Find if the recipe is used by a widened recipe via an intervening
1351   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1352   return any_of(users(), [](const VPUser *U) {
1353     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1354       return any_of(PredR->users(), [PredR](const VPUser *U) {
1355         return !U->usesScalars(PredR);
1356       });
1357     return false;
1358   });
1359 }
1360 
1361 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1362 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1363                               VPSlotTracker &SlotTracker) const {
1364   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1365 
1366   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1367     printAsOperand(O, SlotTracker);
1368     O << " = ";
1369   }
1370   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1371     O << "call";
1372     printFlags(O);
1373     O << "@" << CB->getCalledFunction()->getName() << "(";
1374     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1375                     O, [&O, &SlotTracker](VPValue *Op) {
1376                       Op->printAsOperand(O, SlotTracker);
1377                     });
1378     O << ")";
1379   } else {
1380     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1381     printFlags(O);
1382     printOperands(O, SlotTracker);
1383   }
1384 
1385   if (shouldPack())
1386     O << " (S->V)";
1387 }
1388 #endif
1389 
// Replace the current block's placeholder unreachable terminator with a
// conditional branch on this lane's mask bit (or 'true' when unmasked). The
// branch successors are filled in later, once the target blocks exist.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    // Extract this lane's bit when the mask is a vector value.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
1415 
// Create the phi that merges a predicated instruction's result with the
// value flowing around the predicated block (vector or scalar, see below).
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    // Scalar case: merge the scalar result with poison for lanes where the
    // predicated block did not execute.
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
1461 
1462 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print "PHI-PREDICATED-INSTRUCTION <res> = <operands>".
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
1470 
// Print "WIDEN [<res> = ]<load|store> <operands>"; only loads produce a
// result.
void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
1483 #endif
1484 
1485 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1486   Value *Start = getStartValue()->getLiveInIRValue();
1487   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1488   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1489 
1490   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1491   EntryPart->addIncoming(Start, VectorPH);
1492   EntryPart->setDebugLoc(getDebugLoc());
1493   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1494     State.set(this, EntryPart, Part);
1495 }
1496 
1497 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print "EMIT <res> = CANONICAL-INDUCTION <operands>".
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
1505 #endif
1506 
1507 bool VPCanonicalIVPHIRecipe::isCanonical(
1508     InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
1509     Type *Ty) const {
1510   // The types must match and it must be an integer induction.
1511   if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
1512     return false;
1513   // Start must match the start value of this canonical induction.
1514   if (Start != getStartValue())
1515     return false;
1516 
1517   // If the step is defined by a recipe, it is not a ConstantInt.
1518   if (Step->getDefiningRecipe())
1519     return false;
1520 
1521   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1522   return StepC && StepC->isOne();
1523 }
1524 
1525 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
1526   return IsScalarAfterVectorization &&
1527          (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
1528 }
1529 
1530 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print "EMIT <res> = WIDEN-POINTER-INDUCTION <start>, <step SCEV>".
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
1539 #endif
1540 
1541 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1542   assert(!State.Instance && "cannot be used in per-lane");
1543   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1544   SCEVExpander Exp(SE, DL, "induction");
1545 
1546   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1547                                  &*State.Builder.GetInsertPoint());
1548   assert(!State.ExpandedSCEVs.contains(Expr) &&
1549          "Same SCEV expanded multiple times");
1550   State.ExpandedSCEVs[Expr] = Res;
1551   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1552     State.set(this, Res, {Part, 0});
1553 }
1554 
1555 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1556 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1557                                VPSlotTracker &SlotTracker) const {
1558   O << Indent << "EMIT ";
1559   getVPSingleValue()->printAsOperand(O, SlotTracker);
1560   O << " = EXPAND SCEV " << *Expr;
1561 }
1562 #endif
1563 
1564 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1565   Value *CanonicalIV = State.get(getOperand(0), 0);
1566   Type *STy = CanonicalIV->getType();
1567   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1568   ElementCount VF = State.VF;
1569   Value *VStart = VF.isScalar()
1570                       ? CanonicalIV
1571                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1572   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1573     Value *VStep = createStepForVF(Builder, STy, VF, Part);
1574     if (VF.isVector()) {
1575       VStep = Builder.CreateVectorSplat(VF, VStep);
1576       VStep =
1577           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1578     }
1579     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1580     State.set(this, CanonicalVectorIV, Part);
1581   }
1582 }
1583 
1584 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1585 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1586                                      VPSlotTracker &SlotTracker) const {
1587   O << Indent << "EMIT ";
1588   printAsOperand(O, SlotTracker);
1589   O << " = WIDEN-CANONICAL-INDUCTION ";
1590   printOperands(O, SlotTracker);
1591 }
1592 #endif
1593 
1594 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1595   auto &Builder = State.Builder;
1596   // Create a vector from the initial value.
1597   auto *VectorInit = getStartValue()->getLiveInIRValue();
1598 
1599   Type *VecTy = State.VF.isScalar()
1600                     ? VectorInit->getType()
1601                     : VectorType::get(VectorInit->getType(), State.VF);
1602 
1603   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1604   if (State.VF.isVector()) {
1605     auto *IdxTy = Builder.getInt32Ty();
1606     auto *One = ConstantInt::get(IdxTy, 1);
1607     IRBuilder<>::InsertPointGuard Guard(Builder);
1608     Builder.SetInsertPoint(VectorPH->getTerminator());
1609     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1610     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
1611     VectorInit = Builder.CreateInsertElement(
1612         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
1613   }
1614 
1615   // Create a phi node for the new recurrence.
1616   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
1617   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1618   EntryPart->addIncoming(VectorInit, VectorPH);
1619   State.set(this, EntryPart, 0);
1620 }
1621 
1622 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1623 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
1624                                             VPSlotTracker &SlotTracker) const {
1625   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
1626   printAsOperand(O, SlotTracker);
1627   O << " = phi ";
1628   printOperands(O, SlotTracker);
1629 }
1630 #endif
1631 
/// Create the header phi(s) for a reduction and wire up their preheader
/// incoming values (start value for part 0, identity for the other parts).
/// The backedge incoming values are filled in later.
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  PHINode *PN = cast<PHINode>(getUnderlyingValue());
  auto &Builder = State.Builder;

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  // In-loop reductions stay scalar, like a scalar VF.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  // Ordered reductions only get a phi for part 0; the remaining parts chain
  // off it rather than having independent phis.
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      // Splat the start value in the preheader; it serves as both the part-0
      // start vector and the identity for the other parts.
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      // NOTE: the identity splat is emitted at the builder's current insert
      // point, while the start vector — the identity splat with the start
      // value placed in lane 0 — is emitted in the preheader.
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
1695 
1696 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1697 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1698                                  VPSlotTracker &SlotTracker) const {
1699   O << Indent << "WIDEN-REDUCTION-PHI ";
1700 
1701   printAsOperand(O, SlotTracker);
1702   O << " = phi ";
1703   printOperands(O, SlotTracker);
1704 }
1705 #endif
1706 
1707 void VPWidenPHIRecipe::execute(VPTransformState &State) {
1708   assert(EnableVPlanNativePath &&
1709          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
1710 
1711   Value *Op0 = State.get(getOperand(0), 0);
1712   Type *VecTy = Op0->getType();
1713   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
1714   State.set(this, VecPhi, 0);
1715 }
1716 
1717 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1718 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1719                              VPSlotTracker &SlotTracker) const {
1720   O << Indent << "WIDEN-PHI ";
1721 
1722   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1723   // Unless all incoming values are modeled in VPlan  print the original PHI
1724   // directly.
1725   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1726   // values as VPValues.
1727   if (getNumOperands() != OriginalPhi->getNumOperands()) {
1728     O << VPlanIngredient(OriginalPhi);
1729     return;
1730   }
1731 
1732   printAsOperand(O, SlotTracker);
1733   O << " = phi ";
1734   printOperands(O, SlotTracker);
1735 }
1736 #endif
1737 
1738 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1739 // remove VPActiveLaneMaskPHIRecipe.
1740 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
1741   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1742   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1743     Value *StartMask = State.get(getOperand(0), Part);
1744     PHINode *EntryPart =
1745         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
1746     EntryPart->addIncoming(StartMask, VectorPH);
1747     EntryPart->setDebugLoc(getDebugLoc());
1748     State.set(this, EntryPart, Part);
1749   }
1750 }
1751 
1752 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1753 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1754                                       VPSlotTracker &SlotTracker) const {
1755   O << Indent << "ACTIVE-LANE-MASK-PHI ";
1756 
1757   printAsOperand(O, SlotTracker);
1758   O << " = phi ";
1759   printOperands(O, SlotTracker);
1760 }
1761 #endif
1762