xref: /llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (revision 6f1a8c2da278a04565877e277bc4d5b70055ac74)
1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "VPlanUtils.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Twine.h"
20 #include "llvm/Analysis/IVDescriptors.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/Instruction.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/Type.h"
27 #include "llvm/IR/Value.h"
28 #include "llvm/IR/VectorBuilder.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/CommandLine.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
34 #include "llvm/Transforms/Utils/LoopUtils.h"
35 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
36 #include <cassert>
37 
38 using namespace llvm;
39 
40 using VectorParts = SmallVector<Value *, 2>;
41 
42 namespace llvm {
43 extern cl::opt<bool> EnableVPlanNativePath;
44 }
45 extern cl::opt<unsigned> ForceTargetInstructionCost;
46 
47 #define LV_NAME "loop-vectorize"
48 #define DEBUG_TYPE LV_NAME
49 
50 bool VPRecipeBase::mayWriteToMemory() const {
51   switch (getVPDefID()) {
52   case VPInstructionSC:
53     if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()))
54       return false;
55     switch (cast<VPInstruction>(this)->getOpcode()) {
56     case Instruction::Or:
57     case Instruction::ICmp:
58     case Instruction::Select:
59     case VPInstruction::Not:
60     case VPInstruction::CalculateTripCountMinusVF:
61     case VPInstruction::CanonicalIVIncrementForPart:
62     case VPInstruction::ExtractFromEnd:
63     case VPInstruction::FirstOrderRecurrenceSplice:
64     case VPInstruction::LogicalAnd:
65     case VPInstruction::PtrAdd:
66       return false;
67     default:
68       return true;
69     }
70   case VPInterleaveSC:
71     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
72   case VPWidenStoreEVLSC:
73   case VPWidenStoreSC:
74     return true;
75   case VPReplicateSC:
76     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
77         ->mayWriteToMemory();
78   case VPWidenCallSC:
79     return !cast<VPWidenCallRecipe>(this)
80                 ->getCalledScalarFunction()
81                 ->onlyReadsMemory();
82   case VPBranchOnMaskSC:
83   case VPScalarIVStepsSC:
84   case VPPredInstPHISC:
85     return false;
86   case VPBlendSC:
87   case VPReductionEVLSC:
88   case VPReductionSC:
89   case VPWidenCanonicalIVSC:
90   case VPWidenCastSC:
91   case VPWidenGEPSC:
92   case VPWidenIntOrFpInductionSC:
93   case VPWidenLoadEVLSC:
94   case VPWidenLoadSC:
95   case VPWidenPHISC:
96   case VPWidenSC:
97   case VPWidenEVLSC:
98   case VPWidenSelectSC: {
99     const Instruction *I =
100         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
101     (void)I;
102     assert((!I || !I->mayWriteToMemory()) &&
103            "underlying instruction may write to memory");
104     return false;
105   }
106   default:
107     return true;
108   }
109 }
110 
111 bool VPRecipeBase::mayReadFromMemory() const {
112   switch (getVPDefID()) {
113   case VPWidenLoadEVLSC:
114   case VPWidenLoadSC:
115     return true;
116   case VPReplicateSC:
117     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
118         ->mayReadFromMemory();
119   case VPWidenCallSC:
120     return !cast<VPWidenCallRecipe>(this)
121                 ->getCalledScalarFunction()
122                 ->onlyWritesMemory();
123   case VPBranchOnMaskSC:
124   case VPPredInstPHISC:
125   case VPScalarIVStepsSC:
126   case VPWidenStoreEVLSC:
127   case VPWidenStoreSC:
128     return false;
129   case VPBlendSC:
130   case VPReductionEVLSC:
131   case VPReductionSC:
132   case VPWidenCanonicalIVSC:
133   case VPWidenCastSC:
134   case VPWidenGEPSC:
135   case VPWidenIntOrFpInductionSC:
136   case VPWidenPHISC:
137   case VPWidenSC:
138   case VPWidenEVLSC:
139   case VPWidenSelectSC: {
140     const Instruction *I =
141         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
142     (void)I;
143     assert((!I || !I->mayReadFromMemory()) &&
144            "underlying instruction may read from memory");
145     return false;
146   }
147   default:
148     return true;
149   }
150 }
151 
152 bool VPRecipeBase::mayHaveSideEffects() const {
153   switch (getVPDefID()) {
154   case VPDerivedIVSC:
155   case VPPredInstPHISC:
156   case VPScalarCastSC:
157     return false;
158   case VPInstructionSC:
159     return mayWriteToMemory();
160   case VPWidenCallSC: {
161     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
162     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
163   }
164   case VPBlendSC:
165   case VPReductionEVLSC:
166   case VPReductionSC:
167   case VPScalarIVStepsSC:
168   case VPWidenCanonicalIVSC:
169   case VPWidenCastSC:
170   case VPWidenGEPSC:
171   case VPWidenIntOrFpInductionSC:
172   case VPWidenPHISC:
173   case VPWidenPointerInductionSC:
174   case VPWidenSC:
175   case VPWidenEVLSC:
176   case VPWidenSelectSC: {
177     const Instruction *I =
178         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
179     (void)I;
180     assert((!I || !I->mayHaveSideEffects()) &&
181            "underlying instruction has side-effects");
182     return false;
183   }
184   case VPInterleaveSC:
185     return mayWriteToMemory();
186   case VPWidenLoadEVLSC:
187   case VPWidenLoadSC:
188   case VPWidenStoreEVLSC:
189   case VPWidenStoreSC:
190     assert(
191         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
192             mayWriteToMemory() &&
193         "mayHaveSideEffects result for ingredient differs from this "
194         "implementation");
195     return mayWriteToMemory();
196   case VPReplicateSC: {
197     auto *R = cast<VPReplicateRecipe>(this);
198     return R->getUnderlyingInstr()->mayHaveSideEffects();
199   }
200   default:
201     return true;
202   }
203 }
204 
205 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
206   VPValue *ExitValue = getOperand(0);
207   VPBasicBlock *MiddleVPBB =
208       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
209   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
210   auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
211   // Values leaving the vector loop reach live-out phis in the exiting block
212   // via the middle block.
213   auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
214                        ? MiddleVPBB
215                        : ExitingVPBB;
216   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
217   Value *V = State.get(ExitValue, VPLane(0));
218   if (Phi->getBasicBlockIndex(PredBB) != -1)
219     Phi->setIncomingValueForBlock(PredBB, V);
220   else
221     Phi->addIncoming(V, PredBB);
222 }
223 
224 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
225 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
226   O << "Live-out ";
227   getPhi()->printAsOperand(O);
228   O << " = ";
229   getOperand(0)->printAsOperand(O, SlotTracker);
230   O << "\n";
231 }
232 #endif
233 
234 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
235   assert(!Parent && "Recipe already in some VPBasicBlock");
236   assert(InsertPos->getParent() &&
237          "Insertion position not in any VPBasicBlock");
238   InsertPos->getParent()->insert(this, InsertPos->getIterator());
239 }
240 
241 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
242                                 iplist<VPRecipeBase>::iterator I) {
243   assert(!Parent && "Recipe already in some VPBasicBlock");
244   assert(I == BB.end() || I->getParent() == &BB);
245   BB.insert(this, I);
246 }
247 
248 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
249   assert(!Parent && "Recipe already in some VPBasicBlock");
250   assert(InsertPos->getParent() &&
251          "Insertion position not in any VPBasicBlock");
252   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
253 }
254 
255 void VPRecipeBase::removeFromParent() {
256   assert(getParent() && "Recipe not in any VPBasicBlock");
257   getParent()->getRecipeList().remove(getIterator());
258   Parent = nullptr;
259 }
260 
261 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
262   assert(getParent() && "Recipe not in any VPBasicBlock");
263   return getParent()->getRecipeList().erase(getIterator());
264 }
265 
266 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
267   removeFromParent();
268   insertAfter(InsertPos);
269 }
270 
271 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
272                               iplist<VPRecipeBase>::iterator I) {
273   removeFromParent();
274   insertBefore(BB, I);
275 }
276 
277 /// Return the underlying instruction to be used for computing \p R's cost via
278 /// the legacy cost model. Return nullptr if there's no suitable instruction.
279 static Instruction *getInstructionForCost(const VPRecipeBase *R) {
280   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
281     return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
282   if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
283     return IG->getInsertPos();
284   // Currently the legacy cost model only computes the cost for recipes with an
285   // underlying instruction. Removing the WidenMem case here would prevent
286   // force-target-instruction-cost from overwriting the cost of such recipes,
287   // which would be inconsistent with the legacy model.
288   // TODO: Remove WidenMem from this function when we don't need to compare to
289   // the legacy model.
290   if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
291     return &WidenMem->getIngredient();
292   return nullptr;
293 }
294 
295 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
296   auto *UI = getInstructionForCost(this);
297   if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
298     return 0;
299 
300   InstructionCost RecipeCost = computeCost(VF, Ctx);
301   if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
302       RecipeCost.isValid())
303     RecipeCost = InstructionCost(ForceTargetInstructionCost);
304 
305   LLVM_DEBUG({
306     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
307     dump();
308   });
309   return RecipeCost;
310 }
311 
312 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
313                                           VPCostContext &Ctx) const {
314   // Compute the cost of the recipe by falling back to the legacy cost model,
315   // using the underlying instruction. If there is no underlying instruction,
316   // return 0.
317   Instruction *UI = getInstructionForCost(this);
318   if (UI && isa<VPReplicateRecipe>(this)) {
319     // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
320     // transform; avoid computing their cost multiple times for now.
321     Ctx.SkipCostComputation.insert(UI);
322   }
323   return UI ? Ctx.getLegacyCost(UI, VF) : 0;
324 }
325 
326 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
327   assert(OpType == OperationType::FPMathOp &&
328          "recipe doesn't have fast math flags");
329   FastMathFlags Res;
330   Res.setAllowReassoc(FMFs.AllowReassoc);
331   Res.setNoNaNs(FMFs.NoNaNs);
332   Res.setNoInfs(FMFs.NoInfs);
333   Res.setNoSignedZeros(FMFs.NoSignedZeros);
334   Res.setAllowReciprocal(FMFs.AllowReciprocal);
335   Res.setAllowContract(FMFs.AllowContract);
336   Res.setApproxFunc(FMFs.ApproxFunc);
337   return Res;
338 }
339 
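/// Note: recipes that carry an unroll-part operand hold it as a trailing
/// live-in constant; when that operand is absent, getUnrollPart below defaults
/// to part 0.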
340 template <unsigned PartOpIdx>
341 VPValue *
342 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
343   if (U.getNumOperands() == PartOpIdx + 1)
344     return U.getOperand(PartOpIdx);
345   return nullptr;
346 }
347 
348 template <unsigned PartOpIdx>
349 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
350   if (auto *UnrollPartOp = getUnrollPartOperand(U))
351     return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
352   return 0;
353 }
354 
355 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
356                              VPValue *A, VPValue *B, DebugLoc DL,
357                              const Twine &Name)
358     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
359                           Pred, DL),
360       Opcode(Opcode), Name(Name.str()) {
361   assert(Opcode == Instruction::ICmp &&
362          "only ICmp predicates supported at the moment");
363 }
364 
365 VPInstruction::VPInstruction(unsigned Opcode,
366                              std::initializer_list<VPValue *> Operands,
367                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
368     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
369       Opcode(Opcode), Name(Name.str()) {
370   // Make sure the VPInstruction is a floating-point operation.
371   assert(isFPMathOp() && "this op can't take fast-math flags");
372 }
373 
374 bool VPInstruction::doesGeneratePerAllLanes() const {
375   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
376 }
377 
378 bool VPInstruction::canGenerateScalarForFirstLane() const {
379   if (Instruction::isBinaryOp(getOpcode()))
380     return true;
381   if (isSingleScalar() || isVectorToScalar())
382     return true;
383   switch (Opcode) {
384   case Instruction::ICmp:
385   case VPInstruction::BranchOnCond:
386   case VPInstruction::BranchOnCount:
387   case VPInstruction::CalculateTripCountMinusVF:
388   case VPInstruction::CanonicalIVIncrementForPart:
389   case VPInstruction::PtrAdd:
390   case VPInstruction::ExplicitVectorLength:
391     return true;
392   default:
393     return false;
394   }
395 }
396 
397 Value *VPInstruction::generatePerLane(VPTransformState &State,
398                                       const VPLane &Lane) {
399   IRBuilderBase &Builder = State.Builder;
400 
401   assert(getOpcode() == VPInstruction::PtrAdd &&
402          "only PtrAdd opcodes are supported for now");
403   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
404                               State.get(getOperand(1), Lane), Name);
405 }
406 
407 Value *VPInstruction::generate(VPTransformState &State) {
408   IRBuilderBase &Builder = State.Builder;
409 
410   if (Instruction::isBinaryOp(getOpcode())) {
411     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
412     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
413     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
414     auto *Res =
415         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
416     if (auto *I = dyn_cast<Instruction>(Res))
417       setFlags(I);
418     return Res;
419   }
420 
421   switch (getOpcode()) {
422   case VPInstruction::Not: {
423     Value *A = State.get(getOperand(0));
424     return Builder.CreateNot(A, Name);
425   }
426   case Instruction::ICmp: {
427     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
428     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
429     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
430     return Builder.CreateCmp(getPredicate(), A, B, Name);
431   }
432   case Instruction::Select: {
433     Value *Cond = State.get(getOperand(0));
434     Value *Op1 = State.get(getOperand(1));
435     Value *Op2 = State.get(getOperand(2));
436     return Builder.CreateSelect(Cond, Op1, Op2, Name);
437   }
438   case VPInstruction::ActiveLaneMask: {
439     // Get first lane of vector induction variable.
440     Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
441     // Get the original loop tripcount.
442     Value *ScalarTC = State.get(getOperand(1), VPLane(0));
443 
444     // If this part of the active lane mask is scalar, generate the CMP directly
445     // to avoid unnecessary extracts.
446     if (State.VF.isScalar())
447       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
448                                Name);
449 
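    // For illustration, assuming VF = 4 and an i64 induction/trip count, the
    // call created below is roughly:
    //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %tc)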
450     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
451     auto *PredTy = VectorType::get(Int1Ty, State.VF);
452     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
453                                    {PredTy, ScalarTC->getType()},
454                                    {VIVElem0, ScalarTC}, nullptr, Name);
455   }
456   case VPInstruction::FirstOrderRecurrenceSplice: {
457     // Generate code to combine the previous and current values in vector v3.
458     //
459     //   vector.ph:
460     //     v_init = vector(..., ..., ..., a[-1])
461     //     br vector.body
462     //
463     //   vector.body
464     //     i = phi [0, vector.ph], [i+4, vector.body]
465     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
466     //     v2 = a[i, i+1, i+2, i+3];
467     //     v3 = vector(v1(3), v2(0, 1, 2))
468 
469     auto *V1 = State.get(getOperand(0));
470     if (!V1->getType()->isVectorTy())
471       return V1;
472     Value *V2 = State.get(getOperand(1));
473     return Builder.CreateVectorSplice(V1, V2, -1, Name);
474   }
475   case VPInstruction::CalculateTripCountMinusVF: {
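    // Computes TC > VF * UF ? TC - VF * UF : 0, guarding the subtraction
    // against unsigned underflow.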
476     unsigned UF = getParent()->getPlan()->getUF();
477     Value *ScalarTC = State.get(getOperand(0), VPLane(0));
478     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
479     Value *Sub = Builder.CreateSub(ScalarTC, Step);
480     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
481     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
482     return Builder.CreateSelect(Cmp, Sub, Zero);
483   }
484   case VPInstruction::ExplicitVectorLength: {
485     // TODO: Restructure this code with an explicit remainder loop; vsetvli can
486     // be outside of the main loop.
487     Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
488     // Compute EVL
489     assert(AVL->getType()->isIntegerTy() &&
490            "Requested vector length should be an integer.");
491 
492     assert(State.VF.isScalable() && "Expected scalable vector factor.");
493     Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
494 
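    // For illustration, with a scalable VF of <vscale x 4> the call created
    // below is roughly:
    //   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl,
    //                                                            i32 4, i1 true)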
495     Value *EVL = State.Builder.CreateIntrinsic(
496         State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
497         {AVL, VFArg, State.Builder.getTrue()});
498     return EVL;
499   }
500   case VPInstruction::CanonicalIVIncrementForPart: {
501     unsigned Part = getUnrollPart(*this);
502     auto *IV = State.get(getOperand(0), VPLane(0));
503     assert(Part != 0 && "Must have a positive part");
504     // The canonical IV is incremented by the vectorization factor (num of
505     // SIMD elements) times the unroll part.
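    // E.g. with a fixed VF of 4 and Part == 2, Step below is the constant 8;
    // with a scalable VF of <vscale x 4> it is vscale * 4 * 2.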
506     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
507     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
508                              hasNoSignedWrap());
509   }
510   case VPInstruction::BranchOnCond: {
511     Value *Cond = State.get(getOperand(0), VPLane(0));
512     // Replace the temporary unreachable terminator with a new conditional
513     // branch, hooking it up to the backward destination for exiting blocks now
514     // and to the forward destination(s) later when they are created.
515     BranchInst *CondBr =
516         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
517     CondBr->setSuccessor(0, nullptr);
518     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
519 
520     if (!getParent()->isExiting())
521       return CondBr;
522 
523     VPRegionBlock *ParentRegion = getParent()->getParent();
524     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
525     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
526     return CondBr;
527   }
528   case VPInstruction::BranchOnCount: {
529     // First create the compare.
530     Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
531     Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
532     Value *Cond = Builder.CreateICmpEQ(IV, TC);
533 
534     // Now create the branch.
535     auto *Plan = getParent()->getPlan();
536     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
537     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
538 
539     // Replace the temporary unreachable terminator with a new conditional
540     // branch, hooking it up to backward destination (the header) now and to the
541     // forward destination (the exit/middle block) later when it is created.
542     // Note that CreateCondBr expects a valid BB as its first argument, so we
543     // reset that successor to nullptr afterwards.
544     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
545                                               State.CFG.VPBB2IRBB[Header]);
546     CondBr->setSuccessor(0, nullptr);
547     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
548     return CondBr;
549   }
550   case VPInstruction::ComputeReductionResult: {
551     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
552     // and will be removed by breaking up the recipe further.
553     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
554     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
555     // Get its reduction variable descriptor.
556     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
557 
558     RecurKind RK = RdxDesc.getRecurrenceKind();
559 
560     Type *PhiTy = OrigPhi->getType();
561     // The recipe's operands are the reduction phi, followed by one operand for
562     // each part of the reduction.
563     unsigned UF = getNumOperands() - 1;
564     VectorParts RdxParts(UF);
565     for (unsigned Part = 0; Part < UF; ++Part)
566       RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
567 
568     // If the vector reduction can be performed in a smaller type, we truncate
569     // then extend the loop exit value to enable InstCombine to evaluate the
570     // entire expression in the smaller type.
571     // TODO: Handle this in truncateToMinBW.
572     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
573       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
574       for (unsigned Part = 0; Part < UF; ++Part)
575         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
576     }
577     // Reduce all of the unrolled parts into a single vector.
578     Value *ReducedPartRdx = RdxParts[0];
579     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
580     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
581       Op = Instruction::Or;
582 
583     if (PhiR->isOrdered()) {
584       ReducedPartRdx = RdxParts[UF - 1];
585     } else {
586       // Floating-point operations should have some FMF to enable the reduction.
587       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
588       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
589       for (unsigned Part = 1; Part < UF; ++Part) {
590         Value *RdxPart = RdxParts[Part];
591         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
592           ReducedPartRdx = Builder.CreateBinOp(
593               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
594         else
595           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
596       }
597     }
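    // E.g. for UF == 2 and an integer add reduction, the loop above emits a
    // single "bin.rdx" add combining the two part vectors before the final
    // reduction below.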
598 
599     // Create the reduction after the loop. Note that in-loop reductions create
600     // the target reduction in the loop using a Reduction recipe.
601     if ((State.VF.isVector() ||
602          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
603         !PhiR->isInLoop()) {
604       ReducedPartRdx =
605           createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
606       // If the reduction can be performed in a smaller type, we need to extend
607       // the reduction to the wider type before we branch to the original loop.
608       if (PhiTy != RdxDesc.getRecurrenceType())
609         ReducedPartRdx = RdxDesc.isSigned()
610                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
611                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
612     }
613 
614     // If there were stores of the reduction value to a uniform memory address
615     // inside the loop, create the final store here.
616     if (StoreInst *SI = RdxDesc.IntermediateStore) {
617       auto *NewSI = Builder.CreateAlignedStore(
618           ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
619       propagateMetadata(NewSI, SI);
620     }
621 
622     return ReducedPartRdx;
623   }
624   case VPInstruction::ExtractFromEnd: {
625     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
626     unsigned Offset = CI->getZExtValue();
627     assert(Offset > 0 && "Offset from end must be positive");
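    // E.g. an Offset of 1 extracts the last lane (lane VF - 1), an Offset of 2
    // the second-to-last lane, and so on.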
628     Value *Res;
629     if (State.VF.isVector()) {
630       assert(Offset <= State.VF.getKnownMinValue() &&
631              "invalid offset to extract from");
632       // Extract lane VF - Offset from the operand.
633       Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
634     } else {
635       assert(Offset <= 1 && "invalid offset to extract from");
636       Res = State.get(getOperand(0));
637     }
638     if (isa<ExtractElementInst>(Res))
639       Res->setName(Name);
640     return Res;
641   }
642   case VPInstruction::LogicalAnd: {
643     Value *A = State.get(getOperand(0));
644     Value *B = State.get(getOperand(1));
645     return Builder.CreateLogicalAnd(A, B, Name);
646   }
647   case VPInstruction::PtrAdd: {
648     assert(vputils::onlyFirstLaneUsed(this) &&
649            "can only generate first lane for PtrAdd");
650     Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
651     Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
652     return Builder.CreatePtrAdd(Ptr, Addend, Name);
653   }
654   case VPInstruction::ResumePhi: {
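    // A resume phi merges the value incoming from the VPlan-generated
    // predecessor (operand 0) with the value to use for all other, not yet
    // connected predecessors of the insert block (operand 1).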
655     Value *IncomingFromVPlanPred =
656         State.get(getOperand(0), /* IsScalar */ true);
657     Value *IncomingFromOtherPreds =
658         State.get(getOperand(1), /* IsScalar */ true);
659     auto *NewPhi =
660         Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
661     BasicBlock *VPlanPred =
662         State.CFG
663             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
664     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
665     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
666       assert(OtherPred != VPlanPred &&
667              "VPlan predecessors should not be connected yet");
668       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
669     }
670     return NewPhi;
671   }
672 
673   default:
674     llvm_unreachable("Unsupported opcode for instruction");
675   }
676 }
677 
678 bool VPInstruction::isVectorToScalar() const {
679   return getOpcode() == VPInstruction::ExtractFromEnd ||
680          getOpcode() == VPInstruction::ComputeReductionResult;
681 }
682 
683 bool VPInstruction::isSingleScalar() const {
684   return getOpcode() == VPInstruction::ResumePhi;
685 }
686 
687 #if !defined(NDEBUG)
688 bool VPInstruction::isFPMathOp() const {
689   // Inspired by FPMathOperator::classof. Notable differences are that we don't
690   // support Call and PHI opcodes here yet, and that Select is included.
691   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
692          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
693          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
694          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
695 }
696 #endif
697 
698 void VPInstruction::execute(VPTransformState &State) {
699   assert(!State.Lane && "VPInstruction executing a Lane");
700   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
701   assert((hasFastMathFlags() == isFPMathOp() ||
702           getOpcode() == Instruction::Select) &&
703          "Recipe not a FPMathOp but has fast-math flags?");
704   if (hasFastMathFlags())
705     State.Builder.setFastMathFlags(getFastMathFlags());
706   State.setDebugLocFrom(getDebugLoc());
707   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
708                                    (vputils::onlyFirstLaneUsed(this) ||
709                                     isVectorToScalar() || isSingleScalar());
710   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
711   if (GeneratesPerAllLanes) {
712     for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
713          Lane != NumLanes; ++Lane) {
714       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
715       assert(GeneratedValue && "generatePerLane must produce a value");
716       State.set(this, GeneratedValue, VPLane(Lane));
717     }
718     return;
719   }
720 
721   Value *GeneratedValue = generate(State);
722   if (!hasResult())
723     return;
724   assert(GeneratedValue && "generate must produce a value");
725   assert(
726       (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
727        State.VF.isScalar()) &&
728       "scalar value but not only first lane defined");
729   State.set(this, GeneratedValue,
730             /*IsScalar*/ GeneratesPerFirstLaneOnly);
731 }
732 
733 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
734   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
735   if (Instruction::isBinaryOp(getOpcode()))
736     return vputils::onlyFirstLaneUsed(this);
737 
738   switch (getOpcode()) {
739   default:
740     return false;
741   case Instruction::ICmp:
742   case VPInstruction::PtrAdd:
743     // TODO: Cover additional opcodes.
744     return vputils::onlyFirstLaneUsed(this);
745   case VPInstruction::ActiveLaneMask:
746   case VPInstruction::ExplicitVectorLength:
747   case VPInstruction::CalculateTripCountMinusVF:
748   case VPInstruction::CanonicalIVIncrementForPart:
749   case VPInstruction::BranchOnCount:
750   case VPInstruction::BranchOnCond:
751   case VPInstruction::ResumePhi:
752     return true;
753   };
754   llvm_unreachable("switch should return");
755 }
756 
757 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
758   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
759   if (Instruction::isBinaryOp(getOpcode()))
760     return vputils::onlyFirstPartUsed(this);
761 
762   switch (getOpcode()) {
763   default:
764     return false;
765   case Instruction::ICmp:
766   case Instruction::Select:
767     return vputils::onlyFirstPartUsed(this);
768   case VPInstruction::BranchOnCount:
769   case VPInstruction::BranchOnCond:
770   case VPInstruction::CanonicalIVIncrementForPart:
771     return true;
772   };
773   llvm_unreachable("switch should return");
774 }
775 
776 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
777 void VPInstruction::dump() const {
778   VPSlotTracker SlotTracker(getParent()->getPlan());
779   print(dbgs(), "", SlotTracker);
780 }
781 
782 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
783                           VPSlotTracker &SlotTracker) const {
784   O << Indent << "EMIT ";
785 
786   if (hasResult()) {
787     printAsOperand(O, SlotTracker);
788     O << " = ";
789   }
790 
791   switch (getOpcode()) {
792   case VPInstruction::Not:
793     O << "not";
794     break;
795   case VPInstruction::SLPLoad:
796     O << "combined load";
797     break;
798   case VPInstruction::SLPStore:
799     O << "combined store";
800     break;
801   case VPInstruction::ActiveLaneMask:
802     O << "active lane mask";
803     break;
804   case VPInstruction::ResumePhi:
805     O << "resume-phi";
806     break;
807   case VPInstruction::ExplicitVectorLength:
808     O << "EXPLICIT-VECTOR-LENGTH";
809     break;
810   case VPInstruction::FirstOrderRecurrenceSplice:
811     O << "first-order splice";
812     break;
813   case VPInstruction::BranchOnCond:
814     O << "branch-on-cond";
815     break;
816   case VPInstruction::CalculateTripCountMinusVF:
817     O << "TC > VF ? TC - VF : 0";
818     break;
819   case VPInstruction::CanonicalIVIncrementForPart:
820     O << "VF * Part +";
821     break;
822   case VPInstruction::BranchOnCount:
823     O << "branch-on-count";
824     break;
825   case VPInstruction::ExtractFromEnd:
826     O << "extract-from-end";
827     break;
828   case VPInstruction::ComputeReductionResult:
829     O << "compute-reduction-result";
830     break;
831   case VPInstruction::LogicalAnd:
832     O << "logical-and";
833     break;
834   case VPInstruction::PtrAdd:
835     O << "ptradd";
836     break;
837   default:
838     O << Instruction::getOpcodeName(getOpcode());
839   }
840 
841   printFlags(O);
842   printOperands(O, SlotTracker);
843 
844   if (auto DL = getDebugLoc()) {
845     O << ", !dbg ";
846     DL.print(O);
847   }
848 }
849 #endif
850 
851 void VPIRInstruction::execute(VPTransformState &State) {
852   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
853          "Only PHINodes can have extra operands");
854   if (getNumOperands() == 1) {
855     VPValue *ExitValue = getOperand(0);
856     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
857                     ? VPLane::getFirstLane()
858                     : VPLane::getLastLaneForVF(State.VF);
859     auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
860     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
861     // Set insertion point in PredBB in case an extract needs to be generated.
862     // TODO: Model extracts explicitly.
863     State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
864     Value *V = State.get(ExitValue, VPLane(Lane));
865     auto *Phi = cast<PHINode>(&I);
866     Phi->addIncoming(V, PredBB);
867   }
868 
869   // Advance the insert point to after the wrapped IR instruction. This allows
870   // interleaving VPIRInstructions and other recipes.
871   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
872 }
873 
874 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
875 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
876                             VPSlotTracker &SlotTracker) const {
877   O << Indent << "IR " << I;
878 
879   if (getNumOperands() != 0) {
880     assert(getNumOperands() == 1 && "can have at most 1 operand");
881     O << " (extra operand: ";
882     printOperands(O, SlotTracker);
883     O << ")";
884   }
885 }
886 #endif
887 
888 void VPWidenCallRecipe::execute(VPTransformState &State) {
889   assert(State.VF.isVector() && "not widening");
890   Function *CalledScalarFn = getCalledScalarFunction();
891   assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
892          "DbgInfoIntrinsic should have been dropped during VPlan construction");
893   State.setDebugLocFrom(getDebugLoc());
894 
895   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
896   FunctionType *VFTy = nullptr;
897   if (Variant)
898     VFTy = Variant->getFunctionType();
899   SmallVector<Type *, 2> TysForDecl;
900   // Add return type if intrinsic is overloaded on it.
901   if (UseIntrinsic &&
902       isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
903     TysForDecl.push_back(VectorType::get(
904         CalledScalarFn->getReturnType()->getScalarType(), State.VF));
905   SmallVector<Value *, 4> Args;
906   for (const auto &I : enumerate(arg_operands())) {
907     // Some intrinsics have a scalar argument - don't replace it with a
908     // vector.
909     Value *Arg;
910     if (UseIntrinsic &&
911         isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
912       Arg = State.get(I.value(), VPLane(0));
913     // Some vectorized function variants may also take a scalar argument,
914     // e.g. linear parameters for pointers. This needs to be the scalar value
915     // from the start of the respective part when interleaving.
916     else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
917       Arg = State.get(I.value(), VPLane(0));
918     else
919       Arg = State.get(I.value());
920     if (UseIntrinsic &&
921         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
922       TysForDecl.push_back(Arg->getType());
923     Args.push_back(Arg);
924   }
925 
926   Function *VectorF;
927   if (UseIntrinsic) {
928     // Use vector version of the intrinsic.
929     Module *M = State.Builder.GetInsertBlock()->getModule();
930     VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
931     assert(VectorF && "Can't retrieve vector intrinsic.");
932   } else {
933 #ifndef NDEBUG
934     assert(Variant != nullptr && "Can't create vector function.");
935 #endif
936     VectorF = Variant;
937   }
938 
939   auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
940   SmallVector<OperandBundleDef, 1> OpBundles;
941   if (CI)
942     CI->getOperandBundlesAsDefs(OpBundles);
943 
944   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
945 
946   if (isa<FPMathOperator>(V))
947     V->copyFastMathFlags(CI);
948 
949   if (!V->getType()->isVoidTy())
950     State.set(this, V);
951   State.addMetadata(V, CI);
952 }
953 
954 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
955                                                VPCostContext &Ctx) const {
956   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
957   if (Variant) {
958     return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
959                                     Variant->getFunctionType()->params(),
960                                     CostKind);
961   }
962 
963   FastMathFlags FMF;
964   // TODO: Manage flags via VPRecipeWithIRFlags.
965   if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
966     FMF = FPMO->getFastMathFlags();
967 
968   // Some backends analyze intrinsic arguments to determine cost. Use the
969   // underlying value for the operand if it has one. Otherwise try to use the
970   // operand of the underlying call instruction, if there is one. Otherwise
971   // clear Arguments.
972   // TODO: Rework TTI interface to be independent of concrete IR values.
973   SmallVector<const Value *> Arguments;
974   for (const auto &[Idx, Op] : enumerate(operands())) {
975     auto *V = Op->getUnderlyingValue();
976     if (!V) {
977       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
978         Arguments.push_back(UI->getArgOperand(Idx));
979         continue;
980       }
981       Arguments.clear();
982       break;
983     }
984     Arguments.push_back(V);
985   }
986 
987   Type *RetTy =
988       ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
989   SmallVector<Type *> ParamTys;
990   for (unsigned I = 0; I != getNumOperands(); ++I)
991     ParamTys.push_back(
992         ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
993 
994   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
995   IntrinsicCostAttributes CostAttrs(
996       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
997       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
998   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
999 }
1000 
1001 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1002 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
1003                               VPSlotTracker &SlotTracker) const {
1004   O << Indent << "WIDEN-CALL ";
1005 
1006   Function *CalledFn = getCalledScalarFunction();
1007   if (CalledFn->getReturnType()->isVoidTy())
1008     O << "void ";
1009   else {
1010     printAsOperand(O, SlotTracker);
1011     O << " = ";
1012   }
1013 
1014   O << "call @" << CalledFn->getName() << "(";
1015   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
1016     Op->printAsOperand(O, SlotTracker);
1017   });
1018   O << ")";
1019 
1020   if (VectorIntrinsicID)
1021     O << " (using vector intrinsic)";
1022   else {
1023     O << " (using library function";
1024     if (Variant->hasName())
1025       O << ": " << Variant->getName();
1026     O << ")";
1027   }
1028 }
1029 #endif
1030 
1031 void VPHistogramRecipe::execute(VPTransformState &State) {
1032   State.setDebugLocFrom(getDebugLoc());
1033   IRBuilderBase &Builder = State.Builder;
1034 
1035   Value *Address = State.get(getOperand(0));
1036   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1037   VectorType *VTy = cast<VectorType>(Address->getType());
1038 
1039   // The histogram intrinsic requires a mask even if the recipe doesn't;
1040   // if the mask operand was omitted then all lanes should be executed and
1041   // we just need to synthesize an all-true mask.
1042   Value *Mask = nullptr;
1043   if (VPValue *VPMask = getMask())
1044     Mask = State.get(VPMask);
1045   else
1046     Mask =
1047         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1048 
1049   // If this is a subtract, we want to invert the increment amount. We may
1050   // add a separate intrinsic in the future, but for now we'll try this.
1051   if (Opcode == Instruction::Sub)
1052     IncAmt = Builder.CreateNeg(IncAmt);
1053   else
1054     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1055 
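  // For illustration, with VF = 4, i32 counters and no explicit mask, the call
  // below is roughly:
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %alltrue)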
1056   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1057                                 {VTy, IncAmt->getType()},
1058                                 {Address, IncAmt, Mask});
1059 }
1060 
1061 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1062                                                VPCostContext &Ctx) const {
1063   // FIXME: Take the gather and scatter into account as well. For now we're
1064   //        generating the same cost as the fallback path, but we'll likely
1065   //        need to create a new TTI method for determining the cost, including
1066   //        whether we can use base + vec-of-smaller-indices or just
1067   //        vec-of-pointers.
1068   assert(VF.isVector() && "Invalid VF for histogram cost");
1069   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1070   VPValue *IncAmt = getOperand(1);
1071   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1072   VectorType *VTy = VectorType::get(IncTy, VF);
1073 
1074   // Assume that a non-constant update value (or a constant != 1) requires
1075   // a multiply, and add that into the cost.
1076   InstructionCost MulCost =
1077       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
1078   if (IncAmt->isLiveIn()) {
1079     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1080 
1081     if (CI && CI->getZExtValue() == 1)
1082       MulCost = TTI::TCC_Free;
1083   }
1084 
1085   // Find the cost of the histogram operation itself.
1086   Type *PtrTy = VectorType::get(AddressTy, VF);
1087   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1088   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1089                               Type::getVoidTy(Ctx.LLVMCtx),
1090                               {PtrTy, IncTy, MaskTy});
1091 
1092   // Add the costs together with the add/sub operation.
1093   return Ctx.TTI.getIntrinsicInstrCost(
1094              ICA, TargetTransformInfo::TCK_RecipThroughput) +
1095          MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
1096 }
1097 
1098 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1099 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1100                               VPSlotTracker &SlotTracker) const {
1101   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1102   getOperand(0)->printAsOperand(O, SlotTracker);
1103 
1104   if (Opcode == Instruction::Sub)
1105     O << ", dec: ";
1106   else {
1107     assert(Opcode == Instruction::Add);
1108     O << ", inc: ";
1109   }
1110   getOperand(1)->printAsOperand(O, SlotTracker);
1111 
1112   if (VPValue *Mask = getMask()) {
1113     O << ", mask: ";
1114     Mask->printAsOperand(O, SlotTracker);
1115   }
1116 }
1117 
1118 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1119                                 VPSlotTracker &SlotTracker) const {
1120   O << Indent << "WIDEN-SELECT ";
1121   printAsOperand(O, SlotTracker);
1122   O << " = select ";
1123   getOperand(0)->printAsOperand(O, SlotTracker);
1124   O << ", ";
1125   getOperand(1)->printAsOperand(O, SlotTracker);
1126   O << ", ";
1127   getOperand(2)->printAsOperand(O, SlotTracker);
1128   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1129 }
1130 #endif
1131 
1132 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1133   State.setDebugLocFrom(getDebugLoc());
1134 
1135   // The condition can be loop invariant but still defined inside the
1136   // loop. This means that we can't just use the original 'cond' value.
1137   // We have to take the 'vectorized' value and pick the first lane.
1138   // Instcombine will make this a no-op.
1139   auto *InvarCond =
1140       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1141 
1142   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1143   Value *Op0 = State.get(getOperand(1));
1144   Value *Op1 = State.get(getOperand(2));
1145   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1146   State.set(this, Sel);
1147   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1148 }
1149 
1150 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
1151     const FastMathFlags &FMF) {
1152   AllowReassoc = FMF.allowReassoc();
1153   NoNaNs = FMF.noNaNs();
1154   NoInfs = FMF.noInfs();
1155   NoSignedZeros = FMF.noSignedZeros();
1156   AllowReciprocal = FMF.allowReciprocal();
1157   AllowContract = FMF.allowContract();
1158   ApproxFunc = FMF.approxFunc();
1159 }
1160 
1161 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1162 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1163   switch (OpType) {
1164   case OperationType::Cmp:
1165     O << " " << CmpInst::getPredicateName(getPredicate());
1166     break;
1167   case OperationType::DisjointOp:
1168     if (DisjointFlags.IsDisjoint)
1169       O << " disjoint";
1170     break;
1171   case OperationType::PossiblyExactOp:
1172     if (ExactFlags.IsExact)
1173       O << " exact";
1174     break;
1175   case OperationType::OverflowingBinOp:
1176     if (WrapFlags.HasNUW)
1177       O << " nuw";
1178     if (WrapFlags.HasNSW)
1179       O << " nsw";
1180     break;
1181   case OperationType::FPMathOp:
1182     getFastMathFlags().print(O);
1183     break;
1184   case OperationType::GEPOp:
1185     if (GEPFlags.IsInBounds)
1186       O << " inbounds";
1187     break;
1188   case OperationType::NonNegOp:
1189     if (NonNegFlags.NonNeg)
1190       O << " nneg";
1191     break;
1192   case OperationType::Other:
1193     break;
1194   }
1195   if (getNumOperands() > 0)
1196     O << " ";
1197 }
1198 #endif
1199 
1200 void VPWidenRecipe::execute(VPTransformState &State) {
1201   State.setDebugLocFrom(getDebugLoc());
1202   auto &Builder = State.Builder;
1203   switch (Opcode) {
1204   case Instruction::Call:
1205   case Instruction::Br:
1206   case Instruction::PHI:
1207   case Instruction::GetElementPtr:
1208   case Instruction::Select:
1209     llvm_unreachable("This instruction is handled by a different recipe.");
1210   case Instruction::UDiv:
1211   case Instruction::SDiv:
1212   case Instruction::SRem:
1213   case Instruction::URem:
1214   case Instruction::Add:
1215   case Instruction::FAdd:
1216   case Instruction::Sub:
1217   case Instruction::FSub:
1218   case Instruction::FNeg:
1219   case Instruction::Mul:
1220   case Instruction::FMul:
1221   case Instruction::FDiv:
1222   case Instruction::FRem:
1223   case Instruction::Shl:
1224   case Instruction::LShr:
1225   case Instruction::AShr:
1226   case Instruction::And:
1227   case Instruction::Or:
1228   case Instruction::Xor: {
1229     // Just widen unops and binops.
1230     SmallVector<Value *, 2> Ops;
1231     for (VPValue *VPOp : operands())
1232       Ops.push_back(State.get(VPOp));
1233 
1234     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1235 
1236     if (auto *VecOp = dyn_cast<Instruction>(V))
1237       setFlags(VecOp);
1238 
1239     // Use this vector value for all users of the original instruction.
1240     State.set(this, V);
1241     State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1242     break;
1243   }
1244   case Instruction::Freeze: {
1245     Value *Op = State.get(getOperand(0));
1246 
1247     Value *Freeze = Builder.CreateFreeze(Op);
1248     State.set(this, Freeze);
1249     break;
1250   }
1251   case Instruction::ICmp:
1252   case Instruction::FCmp: {
1253     // Widen compares. Generate vector compares.
1254     bool FCmp = Opcode == Instruction::FCmp;
1255     Value *A = State.get(getOperand(0));
1256     Value *B = State.get(getOperand(1));
1257     Value *C = nullptr;
1258     if (FCmp) {
1259       // Propagate fast math flags.
1260       IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1261       if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
1262         Builder.setFastMathFlags(I->getFastMathFlags());
1263       C = Builder.CreateFCmp(getPredicate(), A, B);
1264     } else {
1265       C = Builder.CreateICmp(getPredicate(), A, B);
1266     }
1267     State.set(this, C);
1268     State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1269     break;
1270   }
1271   default:
1272     // This instruction is not vectorized by simple widening.
1273     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
1274                       << Instruction::getOpcodeName(Opcode));
1275     llvm_unreachable("Unhandled instruction!");
1276   } // end of switch.
1277 
1278 #if !defined(NDEBUG)
1279   // Verify that VPlan type inference results agree with the type of the
1280   // generated values.
1281   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1282              State.get(this)->getType() &&
1283          "inferred type and type from generated instructions do not match");
1284 #endif
1285 }
1286 
1287 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1288                                            VPCostContext &Ctx) const {
1289   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1290   switch (Opcode) {
1291   case Instruction::FNeg: {
1292     Type *VectorTy =
1293         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1294     return Ctx.TTI.getArithmeticInstrCost(
1295         Opcode, VectorTy, CostKind,
1296         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1297         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1298   }
1299 
1300   case Instruction::UDiv:
1301   case Instruction::SDiv:
1302   case Instruction::SRem:
1303   case Instruction::URem:
1304     // More complex computation, let the legacy cost-model handle this for now.
1305     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1306   case Instruction::Add:
1307   case Instruction::FAdd:
1308   case Instruction::Sub:
1309   case Instruction::FSub:
1310   case Instruction::Mul:
1311   case Instruction::FMul:
1312   case Instruction::FDiv:
1313   case Instruction::FRem:
1314   case Instruction::Shl:
1315   case Instruction::LShr:
1316   case Instruction::AShr:
1317   case Instruction::And:
1318   case Instruction::Or:
1319   case Instruction::Xor: {
1320     VPValue *RHS = getOperand(1);
1321     // Certain instructions can be cheaper to vectorize if they have a constant
1322     // second vector operand. One example of this is shifts on x86.
1323     TargetTransformInfo::OperandValueInfo RHSInfo = {
1324         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1325     if (RHS->isLiveIn())
1326       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1327 
1328     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1329         getOperand(1)->isDefinedOutsideLoopRegions())
1330       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1331     Type *VectorTy =
1332         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1333     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1334 
1335     SmallVector<const Value *, 4> Operands;
1336     if (CtxI)
1337       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1338     return Ctx.TTI.getArithmeticInstrCost(
1339         Opcode, VectorTy, CostKind,
1340         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1341         RHSInfo, Operands, CtxI, &Ctx.TLI);
1342   }
1343   case Instruction::Freeze: {
1344     // Freeze has no dedicated cost entry; assume it costs the same as 'mul'.
1345     Type *VectorTy =
1346         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1347     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
1348   }
1349   case Instruction::ICmp:
1350   case Instruction::FCmp: {
1351     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1352     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1353     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1354                                       CostKind,
1355                                       {TTI::OK_AnyValue, TTI::OP_None},
1356                                       {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1357   }
1358   default:
1359     llvm_unreachable("Unsupported opcode for instruction");
1360   }
1361 }
1362 
1363 void VPWidenEVLRecipe::execute(VPTransformState &State) {
1364   unsigned Opcode = getOpcode();
1365   // TODO: Support other opcodes
1366   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1367     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
1368 
1369   State.setDebugLocFrom(getDebugLoc());
1370 
1371   assert(State.get(getOperand(0))->getType()->isVectorTy() &&
1372          "VPWidenEVLRecipe should not be used for scalars");
1373 
1374   VPValue *EVL = getEVL();
1375   Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
1376   IRBuilderBase &BuilderIR = State.Builder;
1377   VectorBuilder Builder(BuilderIR);
1378   Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
1379 
1380   SmallVector<Value *, 4> Ops;
1381   for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
1382     VPValue *VPOp = getOperand(I);
1383     Ops.push_back(State.get(VPOp));
1384   }
1385 
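  // For illustration, an add at VF = <vscale x 4 x i32> becomes roughly:
  //   %vp.op = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(
  //       <vscale x 4 x i32> %a, <vscale x 4 x i32> %b,
  //       <vscale x 4 x i1> %allones, i32 %evl)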
1386   Builder.setMask(Mask).setEVL(EVLArg);
1387   Value *VPInst =
1388       Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
1389   // Currently VP intrinsics only accept fast-math flags.
1390   // TODO: Enable other flags when support is added.
1391   if (isa<FPMathOperator>(VPInst))
1392     setFlags(cast<Instruction>(VPInst));
1393 
1394   State.set(this, VPInst);
1395   State.addMetadata(VPInst,
1396                     dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1397 }
1398 
1399 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1400 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1401                           VPSlotTracker &SlotTracker) const {
1402   O << Indent << "WIDEN ";
1403   printAsOperand(O, SlotTracker);
1404   O << " = " << Instruction::getOpcodeName(Opcode);
1405   printFlags(O);
1406   printOperands(O, SlotTracker);
1407 }
1408 
1409 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1410                              VPSlotTracker &SlotTracker) const {
1411   O << Indent << "WIDEN ";
1412   printAsOperand(O, SlotTracker);
1413   O << " = vp." << Instruction::getOpcodeName(getOpcode());
1414   printFlags(O);
1415   printOperands(O, SlotTracker);
1416 }
1417 #endif
1418 
1419 void VPWidenCastRecipe::execute(VPTransformState &State) {
1420   State.setDebugLocFrom(getDebugLoc());
1421   auto &Builder = State.Builder;
1422   // Vectorize casts.
1423   assert(State.VF.isVector() && "Not vectorizing?");
1424   Type *DestTy = VectorType::get(getResultType(), State.VF);
1425   VPValue *Op = getOperand(0);
1426   Value *A = State.get(Op);
1427   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1428   State.set(this, Cast);
1429   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1430 }
1431 
1432 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1433 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1434                               VPSlotTracker &SlotTracker) const {
1435   O << Indent << "WIDEN-CAST ";
1436   printAsOperand(O, SlotTracker);
1437   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1438   printFlags(O);
1439   printOperands(O, SlotTracker);
1440   O << " to " << *getResultType();
1441 }
1442 #endif
1443 
1444 /// This function adds
1445 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
1446 /// element-wise to the vector Val. The sequence starts at StartIdx.
1447 /// \p BinOp is only relevant for FP induction variables.
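/// For example (illustrative), with an integer Step s, StartIdx = 0, VF = 4 and
/// Val = <v, v, v, v>, the result is <v + 0*s, v + 1*s, v + 2*s, v + 3*s>.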
1448 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1449                             Instruction::BinaryOps BinOp, ElementCount VF,
1450                             IRBuilderBase &Builder) {
1451   assert(VF.isVector() && "only vector VFs are supported");
1452 
1453   // Create and check the types.
1454   auto *ValVTy = cast<VectorType>(Val->getType());
1455   ElementCount VLen = ValVTy->getElementCount();
1456 
1457   Type *STy = Val->getType()->getScalarType();
1458   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1459          "Induction Step must be an integer or FP");
1460   assert(Step->getType() == STy && "Step has wrong type");
1461 
1462   SmallVector<Constant *, 8> Indices;
1463 
1464   // Create a vector of consecutive numbers from zero to VF.
1465   VectorType *InitVecValVTy = ValVTy;
1466   if (STy->isFloatingPointTy()) {
1467     Type *InitVecValSTy =
1468         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1469     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1470   }
1471   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1472 
1473   // Splat the StartIdx
1474   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1475 
1476   if (STy->isIntegerTy()) {
1477     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1478     Step = Builder.CreateVectorSplat(VLen, Step);
1479     assert(Step->getType() == Val->getType() && "Invalid step vec");
1480     // FIXME: The newly created binary instructions should contain nsw/nuw
1481     // flags, which can be found from the original scalar operations.
1482     Step = Builder.CreateMul(InitVec, Step);
1483     return Builder.CreateAdd(Val, Step, "induction");
1484   }
1485 
1486   // Floating point induction.
1487   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1488          "Binary Opcode should be specified for FP induction");
1489   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1490   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1491 
1492   Step = Builder.CreateVectorSplat(VLen, Step);
1493   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1494   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1495 }
1496 
1497 /// A helper function that returns an integer or floating-point constant with
1498 /// value C.
1499 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1500   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1501                            : ConstantFP::get(Ty, C);
1502 }
1503 
1504 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1505   assert(!State.Lane && "Int or FP induction being replicated.");
1506 
1507   Value *Start = getStartValue()->getLiveInIRValue();
1508   const InductionDescriptor &ID = getInductionDescriptor();
1509   TruncInst *Trunc = getTruncInst();
1510   IRBuilderBase &Builder = State.Builder;
1511   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1512   assert(State.VF.isVector() && "must have vector VF");
1513 
1514   // The value from the original loop to which we are mapping the new induction
1515   // variable.
1516   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1517 
1518   // Fast-math-flags propagate from the original induction instruction.
1519   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1520   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1521     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1522 
1523   // Now do the actual transformations, and start with fetching the step value.
1524   Value *Step = State.get(getStepValue(), VPLane(0));
1525 
1526   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1527          "Expected either an induction phi-node or a truncate of it!");
1528 
1529   // Construct the initial value of the vector IV in the vector loop preheader
1530   auto CurrIP = Builder.saveIP();
1531   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1532   Builder.SetInsertPoint(VectorPH->getTerminator());
1533   if (isa<TruncInst>(EntryVal)) {
1534     assert(Start->getType()->isIntegerTy() &&
1535            "Truncation requires an integer type");
1536     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1537     Step = Builder.CreateTrunc(Step, TruncType);
1538     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1539   }
1540 
1541   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1542   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1543   Value *SteppedStart = getStepVector(
1544       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
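  // E.g. (illustrative) for VF = 4 with start value s and step d, SteppedStart
  // is <s + 0*d, s + 1*d, s + 2*d, s + 3*d>.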
1545 
1546   // We create vector phi nodes for both integer and floating-point induction
1547   // variables. Here, we determine the kind of arithmetic we will perform.
1548   Instruction::BinaryOps AddOp;
1549   Instruction::BinaryOps MulOp;
1550   if (Step->getType()->isIntegerTy()) {
1551     AddOp = Instruction::Add;
1552     MulOp = Instruction::Mul;
1553   } else {
1554     AddOp = ID.getInductionOpcode();
1555     MulOp = Instruction::FMul;
1556   }
1557 
1558   Value *SplatVF;
1559   if (VPValue *SplatVFOperand = getSplatVFValue()) {
1560     // The recipe has been unrolled. In that case, fetch the splat value for the
1561     // induction increment.
1562     SplatVF = State.get(SplatVFOperand);
1563   } else {
1564     // Multiply the vectorization factor by the step using integer or
1565     // floating-point arithmetic as appropriate.
1566     Type *StepType = Step->getType();
1567     Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
1568     if (Step->getType()->isFloatingPointTy())
1569       RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1570     else
1571       RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1572     Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1573 
1574     // Create a vector splat to use in the induction update.
1575     //
1576     // FIXME: If the step is non-constant, we create the vector splat with
1577     //        IRBuilder. IRBuilder can constant-fold the multiply, but it
1578     //        doesn't handle a constant vector splat.
1579     SplatVF = isa<Constant>(Mul)
1580                   ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1581                   : Builder.CreateVectorSplat(State.VF, Mul);
1582   }
1583 
1584   Builder.restoreIP(CurrIP);
1585 
1586   // We may need to add the step a number of times, depending on the unroll
1587   // factor. The last of those goes into the PHI.
1588   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1589   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1590   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1591   State.set(this, VecInd);
1592 
1593   Instruction *LastInduction = cast<Instruction>(
1594       Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
1595   if (isa<TruncInst>(EntryVal))
1596     State.addMetadata(LastInduction, EntryVal);
1597   LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1598 
1599   VecInd->addIncoming(SteppedStart, VectorPH);
1600   // Add induction update using an incorrect block temporarily. The phi node
1601   // will be fixed after VPlan execution. Note that at this point the latch
1602   // block cannot be used, as it does not exist yet.
1603   // TODO: Model increment value in VPlan, by turning the recipe into a
1604   // multi-def and a subclass of VPHeaderPHIRecipe.
1605   VecInd->addIncoming(LastInduction, VectorPH);
1606 }
1607 
1608 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1609 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1610                                           VPSlotTracker &SlotTracker) const {
1611   O << Indent << "WIDEN-INDUCTION";
1612   if (getTruncInst()) {
1613     O << "\\l\"";
1614     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1615     O << " +\n" << Indent << "\"  ";
1616     getVPValue(0)->printAsOperand(O, SlotTracker);
1617   } else
1618     O << " " << VPlanIngredient(IV);
1619 
1620   O << ", ";
1621   getStepValue()->printAsOperand(O, SlotTracker);
1622 
1623   O << ", ";
1624   getVFValue()->printAsOperand(O, SlotTracker);
1625 }
1626 #endif
1627 
1628 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1629   // The step may be defined by a recipe in the preheader (e.g. if it requires
1630   // SCEV expansion), but for the canonical induction the step is required to be
1631   // 1, which is represented as live-in.
1632   if (getStepValue()->getDefiningRecipe())
1633     return false;
1634   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1635   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1636   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1637   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1638          getScalarType() == CanIV->getScalarType();
1639 }
1640 
1641 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1642 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1643                               VPSlotTracker &SlotTracker) const {
1644   O << Indent;
1645   printAsOperand(O, SlotTracker);
1646   O << " = DERIVED-IV ";
1647   getStartValue()->printAsOperand(O, SlotTracker);
1648   O << " + ";
1649   getOperand(1)->printAsOperand(O, SlotTracker);
1650   O << " * ";
1651   getStepValue()->printAsOperand(O, SlotTracker);
1652 }
1653 #endif
1654 
1655 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1656   // Fast-math-flags propagate from the original induction instruction.
1657   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1658   if (hasFastMathFlags())
1659     State.Builder.setFastMathFlags(getFastMathFlags());
1660 
1661   // Compute scalar induction steps. BaseIV is the scalar induction variable
1662   // on which to base the steps; Step is the size of the step.
1663 
1664   Value *BaseIV = State.get(getOperand(0), VPLane(0));
1665   Value *Step = State.get(getStepValue(), VPLane(0));
1666   IRBuilderBase &Builder = State.Builder;
1667 
1668   // Ensure step has the same type as that of scalar IV.
1669   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1670   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1671 
1672   // We build scalar steps for both integer and floating-point induction
1673   // variables. Here, we determine the kind of arithmetic we will perform.
1674   Instruction::BinaryOps AddOp;
1675   Instruction::BinaryOps MulOp;
1676   if (BaseIVTy->isIntegerTy()) {
1677     AddOp = Instruction::Add;
1678     MulOp = Instruction::Mul;
1679   } else {
1680     AddOp = InductionOpcode;
1681     MulOp = Instruction::FMul;
1682   }
1683 
1684   // Determine the number of scalars we need to generate for each unroll
1685   // iteration.
1686   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1687   // Compute the scalar steps and save the results in State.
1688   Type *IntStepTy =
1689       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1690   Type *VecIVTy = nullptr;
1691   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1692   if (!FirstLaneOnly && State.VF.isScalable()) {
1693     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1694     UnitStepVec =
1695         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1696     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1697     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1698   }
1699 
1700   unsigned StartLane = 0;
1701   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1702   if (State.Lane) {
1703     StartLane = State.Lane->getKnownLane();
1704     EndLane = StartLane + 1;
1705   }
1706   Value *StartIdx0 =
1707       createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
1708 
1709   if (!FirstLaneOnly && State.VF.isScalable()) {
1710     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1711     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1712     if (BaseIVTy->isFloatingPointTy())
1713       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1714     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1715     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1716     State.set(this, Add);
1717     // It's also useful to record the per-lane values for the known minimum
1718     // number of elements, so we fall through and compute those below. This
1719     // improves code quality when extracting the first element, for example.
1720   }
1721 
1722   if (BaseIVTy->isFloatingPointTy())
1723     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1724 
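  // Each lane below computes BaseIV + (StartIdx0 + Lane) * Step; e.g.
  // (illustrative) for unroll part 0 with VF = 4 this yields BaseIV + 0 * Step
  // .. BaseIV + 3 * Step.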
1725   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1726     Value *StartIdx = Builder.CreateBinOp(
1727         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1728     // The step returned by `createStepForVF` is a runtime-evaluated value
1729     // when VF is scalable. Otherwise, it should be folded into a Constant.
1730     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1731            "Expected StartIdx to be folded to a constant when VF is not "
1732            "scalable");
1733     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1734     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1735     State.set(this, Add, VPLane(Lane));
1736   }
1737 }
1738 
1739 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1740 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1741                                   VPSlotTracker &SlotTracker) const {
1742   O << Indent;
1743   printAsOperand(O, SlotTracker);
1744   O << " = SCALAR-STEPS ";
1745   printOperands(O, SlotTracker);
1746 }
1747 #endif
1748 
1749 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1750   assert(State.VF.isVector() && "not widening");
1751   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1752   // Construct a vector GEP by widening the operands of the scalar GEP as
1753   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1754   // results in a vector of pointers when at least one operand of the GEP
1755   // is vector-typed. Thus, to keep the representation compact, we only use
1756   // vector-typed operands for loop-varying values.
1757 
1758   if (areAllOperandsInvariant()) {
1759     // If we are vectorizing, but the GEP has only loop-invariant operands,
1760     // the GEP we build (by only using vector-typed operands for
1761     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1762     // produce a vector of pointers, we need to either arbitrarily pick an
1763     // operand to broadcast, or broadcast a clone of the original GEP.
1764     // Here, we broadcast a clone of the original.
1765     //
1766     // TODO: If at some point we decide to scalarize instructions having
1767     //       loop-invariant operands, this special case will no longer be
1768     //       required. We would add the scalarization decision to
1769     //       collectLoopScalars() and teach getVectorValue() to broadcast
1770     //       the lane-zero scalar value.
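    //
    // E.g. (illustrative), a GEP whose base pointer and indices are all
    // loop-invariant is emitted once as a scalar GEP and then broadcast with a
    // splat to produce the vector of (identical) pointers.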
1771     SmallVector<Value *> Ops;
1772     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1773       Ops.push_back(State.get(getOperand(I), VPLane(0)));
1774 
1775     auto *NewGEP =
1776         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1777                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1778     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1779     State.set(this, Splat);
1780     State.addMetadata(Splat, GEP);
1781   } else {
1782     // If the GEP has at least one loop-varying operand, we are sure to
1783     // produce a vector of pointers unless VF is scalar.
1784     // The pointer operand of the new GEP. If it's loop-invariant, we
1785     // won't broadcast it.
1786     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
1787                                          : State.get(getOperand(0));
1788 
1789     // Collect all the indices for the new GEP. If any index is
1790     // loop-invariant, we won't broadcast it.
1791     SmallVector<Value *, 4> Indices;
1792     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1793       VPValue *Operand = getOperand(I);
1794       if (isIndexLoopInvariant(I - 1))
1795         Indices.push_back(State.get(Operand, VPLane(0)));
1796       else
1797         Indices.push_back(State.get(Operand));
1798     }
1799 
1800     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1801     // but it should be a vector, otherwise.
1802     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1803                                            Indices, "", isInBounds());
1804     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1805            "NewGEP is not a pointer vector");
1806     State.set(this, NewGEP);
1807     State.addMetadata(NewGEP, GEP);
1808   }
1809 }
1810 
1811 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1812 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1813                              VPSlotTracker &SlotTracker) const {
1814   O << Indent << "WIDEN-GEP ";
1815   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1816   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1817     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1818 
1819   O << " ";
1820   printAsOperand(O, SlotTracker);
1821   O << " = getelementptr";
1822   printFlags(O);
1823   printOperands(O, SlotTracker);
1824 }
1825 #endif
1826 
1827 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1828   auto &Builder = State.Builder;
1829   State.setDebugLocFrom(getDebugLoc());
1830   unsigned CurrentPart = getUnrollPart(*this);
1831   // Use i32 for the gep index type when the value is constant,
1832   // or query DataLayout for a more suitable index type otherwise.
1833   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
1834   Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0)
1835                       ? DL.getIndexType(Builder.getPtrTy(0))
1836                       : Builder.getInt32Ty();
1837   Value *Ptr = State.get(getOperand(0), VPLane(0));
1838   bool InBounds = isInBounds();
1839 
1840   Value *ResultPtr = nullptr;
1841   if (IsReverse) {
1842     // If the address is consecutive but reversed, then the
1843     // wide store needs to start at the last vector element.
1844     // RunTimeVF = VScale * VF.getKnownMinValue()
1845     // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
1846     Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1847     // NumElt = -CurrentPart * RunTimeVF
1848     Value *NumElt = Builder.CreateMul(
1849         ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
1850     // LastLane = 1 - RunTimeVF
1851     Value *LastLane =
1852         Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1853     ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1854     ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds);
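    // E.g. (illustrative) for a fixed VF = 4 and CurrentPart = 0 this computes
    // Ptr + 0 + (1 - 4), i.e. the lowest address of the reversed part, so the
    // wide access covers lanes VF-1 .. 0.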
1855   } else {
1856     Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
1857     ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1858   }
1859 
1860   State.set(this, ResultPtr, /*IsScalar*/ true);
1861 }
1862 
1863 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1864 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1865                                   VPSlotTracker &SlotTracker) const {
1866   O << Indent;
1867   printAsOperand(O, SlotTracker);
1868   O << " = vector-pointer ";
1869   if (IsReverse)
1870     O << "(reverse) ";
1871 
1872   printOperands(O, SlotTracker);
1873 }
1874 #endif
1875 
1876 void VPBlendRecipe::execute(VPTransformState &State) {
1877   assert(isNormalized() && "Expected blend to be normalized!");
1878   State.setDebugLocFrom(getDebugLoc());
1879   // We know that all PHIs in non-header blocks are converted into
1880   // selects, so we don't have to worry about the insertion order and we
1881   // can just use the builder.
1882   // At this point we generate the predication tree. There may be
1883   // duplications since this is a simple recursive scan, but future
1884   // optimizations will clean it up.
1885 
1886   unsigned NumIncoming = getNumIncomingValues();
1887 
1888   // Generate a sequence of selects of the form:
1889   // SELECT(Mask3, In3,
1890   //        SELECT(Mask2, In2,
1891   //               SELECT(Mask1, In1,
1892   //                      In0)))
1893   // Note that Mask0 is never used: lanes for which no path reaches this phi
1894   // are essentially undef and take their value from In0.
1895   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1896   Value *Result = nullptr;
1897   for (unsigned In = 0; In < NumIncoming; ++In) {
1898     // We might have single edge PHIs (blocks) - use an identity
1899     // 'select' for the first PHI operand.
1900     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
1901     if (In == 0)
1902       Result = In0; // Initialize with the first incoming value.
1903     else {
1904       // Select between the current value and the previous incoming edge
1905       // based on the incoming mask.
1906       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
1907       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
1908     }
1909   }
1910   State.set(this, Result, OnlyFirstLaneUsed);
1911 }
1912 
1913 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1914 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1915                           VPSlotTracker &SlotTracker) const {
1916   O << Indent << "BLEND ";
1917   printAsOperand(O, SlotTracker);
1918   O << " =";
1919   if (getNumIncomingValues() == 1) {
1920     // Not a User of any mask: not really blending, this is a
1921     // single-predecessor phi.
1922     O << " ";
1923     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1924   } else {
1925     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1926       O << " ";
1927       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1928       if (I == 0)
1929         continue;
1930       O << "/";
1931       getMask(I)->printAsOperand(O, SlotTracker);
1932     }
1933   }
1934 }
1935 #endif
1936 
1937 void VPReductionRecipe::execute(VPTransformState &State) {
1938   assert(!State.Lane && "Reduction being replicated.");
1939   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
1940   RecurKind Kind = RdxDesc.getRecurrenceKind();
1941   // Propagate the fast-math flags carried by the underlying instruction.
1942   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1943   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1944   Value *NewVecOp = State.get(getVecOp());
1945   if (VPValue *Cond = getCondOp()) {
1946     Value *NewCond = State.get(Cond, State.VF.isScalar());
1947     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1948     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1949 
1950     Value *Start;
1951     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
1952       Start = RdxDesc.getRecurrenceStartValue();
1953     else
1954       Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
1955                                           RdxDesc.getFastMathFlags());
1956     if (State.VF.isVector())
1957       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
1958 
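    // Masked-off lanes receive the neutral value (or the recurrence start value
    // for any-of reductions), so they do not affect the reduced result.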
1959     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
1960     NewVecOp = Select;
1961   }
1962   Value *NewRed;
1963   Value *NextInChain;
1964   if (IsOrdered) {
1965     if (State.VF.isVector())
1966       NewRed =
1967           createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
1968     else
1969       NewRed = State.Builder.CreateBinOp(
1970           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1971           NewVecOp);
1972     PrevInChain = NewRed;
1973     NextInChain = NewRed;
1974   } else {
1975     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
1976     NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
1977     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1978       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1979                                    NewRed, PrevInChain);
1980     else
1981       NextInChain = State.Builder.CreateBinOp(
1982           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1983   }
1984   State.set(this, NextInChain, /*IsScalar*/ true);
1985 }
1986 
1987 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1988   assert(!State.Lane && "Reduction being replicated.");
1989 
1990   auto &Builder = State.Builder;
1991   // Propagate the fast-math flags carried by the underlying instruction.
1992   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1993   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1994   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1995 
1996   RecurKind Kind = RdxDesc.getRecurrenceKind();
1997   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
1998   Value *VecOp = State.get(getVecOp());
1999   Value *EVL = State.get(getEVL(), VPLane(0));
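  // E.g. (illustrative) an ordered fadd reduction lowers to roughly
  //   %red = call float @llvm.vp.reduce.fadd.nxv4f32(float %prev,
  //            <vscale x 4 x float> %vec, <vscale x 4 x i1> %mask, i32 %evl)
  // while the unordered case reduces with a neutral start value and then
  // combines the result with %prev below.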
2000 
2001   VectorBuilder VBuilder(Builder);
2002   VBuilder.setEVL(EVL);
2003   Value *Mask;
2004   // TODO: move the all-true mask generation into VectorBuilder.
2005   if (VPValue *CondOp = getCondOp())
2006     Mask = State.get(CondOp);
2007   else
2008     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2009   VBuilder.setMask(Mask);
2010 
2011   Value *NewRed;
2012   if (isOrdered()) {
2013     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
2014   } else {
2015     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
2016     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2017       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2018     else
2019       NewRed = Builder.CreateBinOp(
2020           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
2021   }
2022   State.set(this, NewRed, /*IsScalar*/ true);
2023 }
2024 
2025 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2026 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2027                               VPSlotTracker &SlotTracker) const {
2028   O << Indent << "REDUCE ";
2029   printAsOperand(O, SlotTracker);
2030   O << " = ";
2031   getChainOp()->printAsOperand(O, SlotTracker);
2032   O << " +";
2033   if (isa<FPMathOperator>(getUnderlyingInstr()))
2034     O << getUnderlyingInstr()->getFastMathFlags();
2035   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2036   getVecOp()->printAsOperand(O, SlotTracker);
2037   if (isConditional()) {
2038     O << ", ";
2039     getCondOp()->printAsOperand(O, SlotTracker);
2040   }
2041   O << ")";
2042   if (RdxDesc.IntermediateStore)
2043     O << " (with final reduction value stored in invariant address sank "
2044          "outside of loop)";
2045 }
2046 
2047 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2048                                  VPSlotTracker &SlotTracker) const {
2049   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2050   O << Indent << "REDUCE ";
2051   printAsOperand(O, SlotTracker);
2052   O << " = ";
2053   getChainOp()->printAsOperand(O, SlotTracker);
2054   O << " +";
2055   if (isa<FPMathOperator>(getUnderlyingInstr()))
2056     O << getUnderlyingInstr()->getFastMathFlags();
2057   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2058   getVecOp()->printAsOperand(O, SlotTracker);
2059   O << ", ";
2060   getEVL()->printAsOperand(O, SlotTracker);
2061   if (isConditional()) {
2062     O << ", ";
2063     getCondOp()->printAsOperand(O, SlotTracker);
2064   }
2065   O << ")";
2066   if (RdxDesc.IntermediateStore)
2067     O << " (with final reduction value stored in invariant address sank "
2068          "outside of loop)";
2069 }
2070 #endif
2071 
2072 bool VPReplicateRecipe::shouldPack() const {
2073   // Find if the recipe is used by a widened recipe via an intervening
2074   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2075   return any_of(users(), [](const VPUser *U) {
2076     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2077       return any_of(PredR->users(), [PredR](const VPUser *U) {
2078         return !U->usesScalars(PredR);
2079       });
2080     return false;
2081   });
2082 }
2083 
2084 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2085 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2086                               VPSlotTracker &SlotTracker) const {
2087   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2088 
2089   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2090     printAsOperand(O, SlotTracker);
2091     O << " = ";
2092   }
2093   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2094     O << "call";
2095     printFlags(O);
2096     O << "@" << CB->getCalledFunction()->getName() << "(";
2097     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2098                     O, [&O, &SlotTracker](VPValue *Op) {
2099                       Op->printAsOperand(O, SlotTracker);
2100                     });
2101     O << ")";
2102   } else {
2103     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2104     printFlags(O);
2105     printOperands(O, SlotTracker);
2106   }
2107 
2108   if (shouldPack())
2109     O << " (S->V)";
2110 }
2111 #endif
2112 
2113 Value *VPScalarCastRecipe::generate(VPTransformState &State) {
2114   assert(vputils::onlyFirstLaneUsed(this) &&
2115          "Codegen only implemented for first lane.");
2116   switch (Opcode) {
2117   case Instruction::SExt:
2118   case Instruction::ZExt:
2119   case Instruction::Trunc: {
2120     // Note: SExt/ZExt not used yet.
2121     Value *Op = State.get(getOperand(0), VPLane(0));
2122     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2123   }
2124   default:
2125     llvm_unreachable("opcode not implemented yet");
2126   }
2127 }
2128 
2129 void VPScalarCastRecipe::execute(VPTransformState &State) {
2130   State.set(this, generate(State), VPLane(0));
2131 }
2132 
2133 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2134 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2135                                 VPSlotTracker &SlotTracker) const {
2136   O << Indent << "SCALAR-CAST ";
2137   printAsOperand(O, SlotTracker);
2138   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2139   printOperands(O, SlotTracker);
2140   O << " to " << *ResultTy;
2141 }
2142 #endif
2143 
2144 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2145   assert(State.Lane && "Branch on Mask works only on single instance.");
2146 
2147   unsigned Lane = State.Lane->getKnownLane();
2148 
2149   Value *ConditionBit = nullptr;
2150   VPValue *BlockInMask = getMask();
2151   if (BlockInMask) {
2152     ConditionBit = State.get(BlockInMask);
2153     if (ConditionBit->getType()->isVectorTy())
2154       ConditionBit = State.Builder.CreateExtractElement(
2155           ConditionBit, State.Builder.getInt32(Lane));
2156   } else // Block in mask is all-one.
2157     ConditionBit = State.Builder.getTrue();
2158 
2159   // Replace the temporary unreachable terminator with a new conditional branch,
2160   // whose two destinations will be set later when they are created.
2161   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2162   assert(isa<UnreachableInst>(CurrentTerminator) &&
2163          "Expected to replace unreachable terminator with conditional branch.");
2164   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2165   CondBr->setSuccessor(0, nullptr);
2166   ReplaceInstWithInst(CurrentTerminator, CondBr);
2167 }
2168 
2169 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2170   assert(State.Lane && "Predicated instruction PHI works per instance.");
2171   Instruction *ScalarPredInst =
2172       cast<Instruction>(State.get(getOperand(0), *State.Lane));
2173   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2174   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2175   assert(PredicatingBB && "Predicated block has no single predecessor.");
2176   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2177          "operand must be VPReplicateRecipe");
2178 
2179   // By current pack/unpack logic we need to generate only a single phi node: if
2180   // a vector value for the predicated instruction exists at this point it means
2181   // the instruction has vector users only, and a phi for the vector value is
2182   // needed. In this case the recipe of the predicated instruction is marked to
2183   // also do that packing, thereby "hoisting" the insert-element sequence.
2184   // Otherwise, a phi node for the scalar value is needed.
2185   if (State.hasVectorValue(getOperand(0))) {
2186     Value *VectorValue = State.get(getOperand(0));
2187     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2188     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2189     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2190     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2191     if (State.hasVectorValue(this))
2192       State.reset(this, VPhi);
2193     else
2194       State.set(this, VPhi);
2195     // NOTE: Currently we need to update the value of the operand, so the next
2196     // predicated iteration inserts its generated value in the correct vector.
2197     State.reset(getOperand(0), VPhi);
2198   } else {
2199     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2200     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2201     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2202                      PredicatingBB);
2203     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2204     if (State.hasScalarValue(this, *State.Lane))
2205       State.reset(this, Phi, *State.Lane);
2206     else
2207       State.set(this, Phi, *State.Lane);
2208     // NOTE: Currently we need to update the value of the operand, so the next
2209     // predicated iteration inserts its generated value in the correct vector.
2210     State.reset(getOperand(0), Phi, *State.Lane);
2211   }
2212 }
2213 
2214 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2215 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2216                                 VPSlotTracker &SlotTracker) const {
2217   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2218   printAsOperand(O, SlotTracker);
2219   O << " = ";
2220   printOperands(O, SlotTracker);
2221 }
2222 #endif
2223 
2224 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2225                                                  VPCostContext &Ctx) const {
2226   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2227   const Align Alignment =
2228       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2229   unsigned AS =
2230       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2231   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2232 
2233   if (!Consecutive) {
2234     // TODO: Using the original IR may not be accurate.
2235     // Currently, ARM will use the underlying IR to calculate gather/scatter
2236     // instruction cost.
2237     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2238     assert(!Reverse &&
2239            "Inconsecutive memory access should not have the order.");
2240     return Ctx.TTI.getAddressComputationCost(Ty) +
2241            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2242                                           IsMasked, Alignment, CostKind,
2243                                           &Ingredient);
2244   }
2245 
2246   InstructionCost Cost = 0;
2247   if (IsMasked) {
2248     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2249                                           AS, CostKind);
2250   } else {
2251     TTI::OperandValueInfo OpInfo =
2252         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2253     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2254                                     CostKind, OpInfo, &Ingredient);
2255   }
2256   if (!Reverse)
2257     return Cost;
2258 
2259   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2260                                         cast<VectorType>(Ty), {}, CostKind, 0);
2261 }
2262 
2263 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2264   auto *LI = cast<LoadInst>(&Ingredient);
2265 
2266   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2267   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2268   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2269   bool CreateGather = !isConsecutive();
2270 
2271   auto &Builder = State.Builder;
2272   State.setDebugLocFrom(getDebugLoc());
2273   Value *Mask = nullptr;
2274   if (auto *VPMask = getMask()) {
2275     // Mask reversal is only needed for masks that are not all-one; an all-one
2276     // mask is represented as null, and its reverse is also all-one (null).
2277     Mask = State.get(VPMask);
2278     if (isReverse())
2279       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2280   }
2281 
2282   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
2283   Value *NewLI;
2284   if (CreateGather) {
2285     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2286                                        "wide.masked.gather");
2287   } else if (Mask) {
2288     NewLI =
2289         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2290                                  PoisonValue::get(DataTy), "wide.masked.load");
2291   } else {
2292     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2293   }
2294   // Add metadata to the load, but set the recipe's result to the reverse shuffle.
2295   State.addMetadata(NewLI, LI);
2296   if (Reverse)
2297     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2298   State.set(this, NewLI);
2299 }
2300 
2301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2302 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2303                               VPSlotTracker &SlotTracker) const {
2304   O << Indent << "WIDEN ";
2305   printAsOperand(O, SlotTracker);
2306   O << " = load ";
2307   printOperands(O, SlotTracker);
2308 }
2309 #endif
2310 
2311 /// Use an all-true mask for the reverse rather than the actual mask, as this
2312 /// avoids a dependence without affecting the result.
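/// E.g. (illustrative):
///   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
///            <vscale x 4 x i32> %v, <vscale x 4 x i1> %alltrue, i32 %evl)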
2313 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2314                                      Value *EVL, const Twine &Name) {
2315   VectorType *ValTy = cast<VectorType>(Operand->getType());
2316   Value *AllTrueMask =
2317       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2318   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2319                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2320 }
2321 
2322 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2323   auto *LI = cast<LoadInst>(&Ingredient);
2324 
2325   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2326   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2327   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2328   bool CreateGather = !isConsecutive();
2329 
2330   auto &Builder = State.Builder;
2331   State.setDebugLocFrom(getDebugLoc());
2332   CallInst *NewLI;
2333   Value *EVL = State.get(getEVL(), VPLane(0));
2334   Value *Addr = State.get(getAddr(), !CreateGather);
2335   Value *Mask = nullptr;
2336   if (VPValue *VPMask = getMask()) {
2337     Mask = State.get(VPMask);
2338     if (isReverse())
2339       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2340   } else {
2341     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2342   }
2343 
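  // E.g. (illustrative) the consecutive, non-reversed case emits roughly
  //   %l = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %addr,
  //          <vscale x 4 x i1> %mask, i32 %evl)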
2344   if (CreateGather) {
2345     NewLI =
2346         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2347                                 nullptr, "wide.masked.gather");
2348   } else {
2349     VectorBuilder VBuilder(Builder);
2350     VBuilder.setEVL(EVL).setMask(Mask);
2351     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2352         Instruction::Load, DataTy, Addr, "vp.op.load"));
2353   }
2354   NewLI->addParamAttr(
2355       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2356   State.addMetadata(NewLI, LI);
2357   Instruction *Res = NewLI;
2358   if (isReverse())
2359     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2360   State.set(this, Res);
2361 }
2362 
2363 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
2364                                                   VPCostContext &Ctx) const {
2365   if (!Consecutive || IsMasked)
2366     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2367 
2368   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2369   // because the EVL recipes use EVL to replace the tail mask, whereas the
2370   // legacy model always accounts for the cost of the mask.
2371   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2372   // longer need to compare against the legacy cost model.
2373   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2374   const Align Alignment =
2375       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2376   unsigned AS =
2377       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2378   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2379   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2380       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2381   if (!Reverse)
2382     return Cost;
2383 
2384   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2385                                        cast<VectorType>(Ty), {}, CostKind, 0);
2386 }
2387 
2388 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2389 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2390                                  VPSlotTracker &SlotTracker) const {
2391   O << Indent << "WIDEN ";
2392   printAsOperand(O, SlotTracker);
2393   O << " = vp.load ";
2394   printOperands(O, SlotTracker);
2395 }
2396 #endif
2397 
2398 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2399   auto *SI = cast<StoreInst>(&Ingredient);
2400 
2401   VPValue *StoredVPValue = getStoredValue();
2402   bool CreateScatter = !isConsecutive();
2403   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2404 
2405   auto &Builder = State.Builder;
2406   State.setDebugLocFrom(getDebugLoc());
2407 
2408   Value *Mask = nullptr;
2409   if (auto *VPMask = getMask()) {
2410     // Mask reversal is only needed for masks that are not all-one; an all-one
2411     // mask is represented as null, and its reverse is also all-one (null).
2412     Mask = State.get(VPMask);
2413     if (isReverse())
2414       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2415   }
2416 
2417   Value *StoredVal = State.get(StoredVPValue);
2418   if (isReverse()) {
2419     // If we store to reverse consecutive memory locations, then we need
2420     // to reverse the order of elements in the stored value.
2421     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2422     // We don't want to update the value in the map as it might be used in
2423     // another expression. So don't call resetVectorValue(StoredVal).
2424   }
2425   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2426   Instruction *NewSI = nullptr;
2427   if (CreateScatter)
2428     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2429   else if (Mask)
2430     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2431   else
2432     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2433   State.addMetadata(NewSI, SI);
2434 }
2435 
2436 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2437 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2438                                VPSlotTracker &SlotTracker) const {
2439   O << Indent << "WIDEN store ";
2440   printOperands(O, SlotTracker);
2441 }
2442 #endif
2443 
2444 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2445   auto *SI = cast<StoreInst>(&Ingredient);
2446 
2447   VPValue *StoredValue = getStoredValue();
2448   bool CreateScatter = !isConsecutive();
2449   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2450 
2451   auto &Builder = State.Builder;
2452   State.setDebugLocFrom(getDebugLoc());
2453 
2454   CallInst *NewSI = nullptr;
2455   Value *StoredVal = State.get(StoredValue);
2456   Value *EVL = State.get(getEVL(), VPLane(0));
2457   if (isReverse())
2458     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2459   Value *Mask = nullptr;
2460   if (VPValue *VPMask = getMask()) {
2461     Mask = State.get(VPMask);
2462     if (isReverse())
2463       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2464   } else {
2465     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2466   }
2467   Value *Addr = State.get(getAddr(), !CreateScatter);
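  // E.g. (illustrative) the consecutive, non-reversed case emits roughly
  //   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr %addr,
  //          <vscale x 4 x i1> %mask, i32 %evl)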
2468   if (CreateScatter) {
2469     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2470                                     Intrinsic::vp_scatter,
2471                                     {StoredVal, Addr, Mask, EVL});
2472   } else {
2473     VectorBuilder VBuilder(Builder);
2474     VBuilder.setEVL(EVL).setMask(Mask);
2475     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2476         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2477         {StoredVal, Addr}));
2478   }
2479   NewSI->addParamAttr(
2480       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2481   State.addMetadata(NewSI, SI);
2482 }
2483 
2484 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
2485                                                    VPCostContext &Ctx) const {
2486   if (!Consecutive || IsMasked)
2487     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2488 
2489   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2490   // because the EVL recipes use EVL to replace the tail mask, whereas the
2491   // legacy model always accounts for the cost of the mask.
2492   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2493   // longer need to compare against the legacy cost model.
2494   Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
2495   const Align Alignment =
2496       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2497   unsigned AS =
2498       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2499   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2500   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2501       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2502   if (!Reverse)
2503     return Cost;
2504 
2505   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2506                                        cast<VectorType>(Ty), {}, CostKind, 0);
2507 }
2508 
2509 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2510 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2511                                   VPSlotTracker &SlotTracker) const {
2512   O << Indent << "WIDEN vp.store ";
2513   printOperands(O, SlotTracker);
2514 }
2515 #endif
2516 
2517 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2518                                      VectorType *DstVTy, const DataLayout &DL) {
2519   // Verify that V is a vector type with same number of elements as DstVTy.
2520   auto VF = DstVTy->getElementCount();
2521   auto *SrcVecTy = cast<VectorType>(V->getType());
2522   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2523   Type *SrcElemTy = SrcVecTy->getElementType();
2524   Type *DstElemTy = DstVTy->getElementType();
2525   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2526          "Vector elements must have same size");
2527 
2528   // Do a direct cast if element types are castable.
2529   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2530     return Builder.CreateBitOrPointerCast(V, DstVTy);
2531   }
2532   // V cannot be directly casted to desired vector type.
2533   // May happen when V is a floating point vector but DstVTy is a vector of
2534   // pointers or vice-versa. Handle this using a two-step bitcast using an
2535   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
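  // E.g. (illustrative), casting <4 x double> to <4 x ptr> with 64-bit pointers
  // first bitcasts to <4 x i64> and then applies inttoptr to that vector.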
2536   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2537          "Only one type should be a pointer type");
2538   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2539          "Only one type should be a floating point type");
2540   Type *IntTy =
2541       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2542   auto *VecIntTy = VectorType::get(IntTy, VF);
2543   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2544   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2545 }
2546 
2547 /// Return a vector containing interleaved elements from multiple
2548 /// smaller input vectors.
2549 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2550                                 const Twine &Name) {
2551   unsigned Factor = Vals.size();
2552   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2553 
2554   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2555 #ifndef NDEBUG
2556   for (Value *Val : Vals)
2557     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2558 #endif
2559 
2560   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so we
2561   // must use intrinsics to interleave.
2562   if (VecTy->isScalableTy()) {
2563     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2564     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2565                                    Vals,
2566                                    /*FMFSource=*/nullptr, Name);
2567   }
2568 
2569   // Fixed length. Start by concatenating all vectors into a wide vector.
2570   Value *WideVec = concatenateVectors(Builder, Vals);
2571 
2572   // Interleave the elements into the wide vector.
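  // E.g. (illustrative), for two <4 x i32> inputs the interleave mask is
  // <0, 4, 1, 5, 2, 6, 3, 7>.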
2573   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2574   return Builder.CreateShuffleVector(
2575       WideVec, createInterleaveMask(NumElts, Factor), Name);
2576 }
2577 
2578 // Try to vectorize the interleave group that \p Instr belongs to.
2579 //
2580 // E.g. Translate following interleaved load group (factor = 3):
2581 //   for (i = 0; i < N; i+=3) {
2582 //     R = Pic[i];             // Member of index 0
2583 //     G = Pic[i+1];           // Member of index 1
2584 //     B = Pic[i+2];           // Member of index 2
2585 //     ... // do something to R, G, B
2586 //   }
2587 // To:
2588 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2589 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2590 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2591 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2592 //
2593 // Or translate following interleaved store group (factor = 3):
2594 //   for (i = 0; i < N; i+=3) {
2595 //     ... do something to R, G, B
2596 //     Pic[i]   = R;           // Member of index 0
2597 //     Pic[i+1] = G;           // Member of index 1
2598 //     Pic[i+2] = B;           // Member of index 2
2599 //   }
2600 // To:
2601 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2602 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2603 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2604 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2605 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2606 void VPInterleaveRecipe::execute(VPTransformState &State) {
2607   assert(!State.Lane && "Interleave group being replicated.");
2608   const InterleaveGroup<Instruction> *Group = IG;
2609   Instruction *Instr = Group->getInsertPos();
2610 
2611   // Prepare for the vector type of the interleaved load/store.
2612   Type *ScalarTy = getLoadStoreType(Instr);
2613   unsigned InterleaveFactor = Group->getFactor();
2614   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2615 
2616   // Prepare for the new pointers.
2617   unsigned Index = Group->getIndex(Instr);
2618 
2619   // TODO: extend the masked interleaved-group support to reversed access.
2620   VPValue *BlockInMask = getMask();
2621   assert((!BlockInMask || !Group->isReverse()) &&
2622          "Reversed masked interleave-group not supported.");
2623 
2624   Value *Idx;
2625   // If the group is reverse, adjust the index to refer to the last vector lane
2626   // instead of the first. We adjust the index from the first vector lane,
2627   // rather than directly getting the pointer for lane VF - 1, because the
2628   // pointer operand of the interleaved access is supposed to be uniform.
2629   if (Group->isReverse()) {
2630     Value *RuntimeVF =
2631         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2632     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2633     Idx = State.Builder.CreateMul(Idx,
2634                                   State.Builder.getInt32(Group->getFactor()));
2635     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2636     Idx = State.Builder.CreateNeg(Idx);
2637   } else
2638     Idx = State.Builder.getInt32(-Index);
2639 
2640   VPValue *Addr = getAddr();
2641   Value *ResAddr = State.get(Addr, VPLane(0));
2642   if (auto *I = dyn_cast<Instruction>(ResAddr))
2643     State.setDebugLocFrom(I->getDebugLoc());
2644 
2645   // Note that the current instruction could be a member of any index. We need
2646   // to adjust the address to the member of index 0.
2647   //
2648   // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2649   //       b = A[i];       // Member of index 0
2650   // The current pointer points to A[i+1]; adjust it to A[i].
2651   //
2652   // E.g.  A[i+1] = a;     // Member of index 1
2653   //       A[i]   = b;     // Member of index 0
2654   //       A[i+2] = c;     // Member of index 2 (Current instruction)
2655   // The current pointer points to A[i+2]; adjust it to A[i].
2656 
2657   bool InBounds = false;
2658   if (auto *GEP = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
2659     InBounds = GEP->isInBounds();
2660   ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
2661 
2662   State.setDebugLocFrom(Instr->getDebugLoc());
2663   Value *PoisonVec = PoisonValue::get(VecTy);
2664 
2665   auto CreateGroupMask = [&BlockInMask, &State,
2666                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
2667     if (State.VF.isScalable()) {
2668       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2669       assert(InterleaveFactor == 2 &&
2670              "Unsupported deinterleave factor for scalable vectors");
2671       auto *ResBlockInMask = State.get(BlockInMask);
2672       SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2673       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2674                                      State.VF.getKnownMinValue() * 2, true);
2675       return State.Builder.CreateIntrinsic(
2676           MaskTy, Intrinsic::vector_interleave2, Ops,
2677           /*FMFSource=*/nullptr, "interleaved.mask");
2678     }
2679 
2680     if (!BlockInMask)
2681       return MaskForGaps;
2682 
2683     Value *ResBlockInMask = State.get(BlockInMask);
2684     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2685         ResBlockInMask,
2686         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2687         "interleaved.mask");
2688     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2689                                                    ShuffledMask, MaskForGaps)
2690                        : ShuffledMask;
2691   };
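  // For illustration (assuming a fixed VF of 4 and an interleave factor of 3):
  // a block mask <m0, m1, m2, m3> is replicated per member to
  // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> and, if a gap mask is
  // present, combined with it using an 'and'.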
2692 
2693   const DataLayout &DL = Instr->getDataLayout();
2694   // Vectorize the interleaved load group.
2695   if (isa<LoadInst>(Instr)) {
2696     Value *MaskForGaps = nullptr;
2697     if (NeedsMaskForGaps) {
2698       MaskForGaps = createBitMaskForGaps(State.Builder,
2699                                          State.VF.getKnownMinValue(), *Group);
2700       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2701     }
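    // For illustration (assuming a fixed VF of 4, factor 3 and a gap at member
    // index 1): the gap mask disables the missing member in every tuple,
    // i.e. <1,0,1, 1,0,1, 1,0,1, 1,0,1>.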
2702 
2703     Instruction *NewLoad;
2704     if (BlockInMask || MaskForGaps) {
2705       Value *GroupMask = CreateGroupMask(MaskForGaps);
2706       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
2707                                                Group->getAlign(), GroupMask,
2708                                                PoisonVec, "wide.masked.vec");
2709     } else
2710       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
2711                                                 Group->getAlign(), "wide.vec");
2712     Group->addMetadata(NewLoad);
2713 
2714     ArrayRef<VPValue *> VPDefs = definedValues();
2716     if (VecTy->isScalableTy()) {
2717       assert(InterleaveFactor == 2 &&
2718              "Unsupported deinterleave factor for scalable vectors");
2719 
2720       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2721       // so we must use intrinsics to deinterleave.
2722       Value *DI = State.Builder.CreateIntrinsic(
2723           Intrinsic::vector_deinterleave2, VecTy, NewLoad,
2724           /*FMFSource=*/nullptr, "strided.vec");
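      // For illustration (assuming i32 elements and a VF of vscale x 4):
      // vector_deinterleave2 splits the <vscale x 8 x i32> wide load into a
      // pair of <vscale x 4 x i32> vectors holding the even- and odd-indexed
      // elements, one per group member.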
2725       unsigned J = 0;
2726       for (unsigned I = 0; I < InterleaveFactor; ++I) {
2727         Instruction *Member = Group->getMember(I);
2728 
2729         if (!Member)
2730           continue;
2731 
2732         Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2733         // If this member has a different type, cast the result to it.
2734         if (Member->getType() != ScalarTy) {
2735           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2736           StridedVec =
2737               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2738         }
2739 
2740         if (Group->isReverse())
2741           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2742 
2743         State.set(VPDefs[J], StridedVec);
2744         ++J;
2745       }
2746 
2747       return;
2748     }
2749 
2750     // For each member in the group, shuffle out the appropriate data from the
2751     // wide load.
2752     unsigned J = 0;
2753     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2754       Instruction *Member = Group->getMember(I);
2755 
2756       // Skip the gaps in the group.
2757       if (!Member)
2758         continue;
2759 
2760       auto StrideMask =
2761           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2762       Value *StridedVec =
2763           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
2764 
2765       // If this member has a different type, cast the result to it.
2766       if (Member->getType() != ScalarTy) {
2767         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2768         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2769         StridedVec =
2770             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2771       }
2772 
2773       if (Group->isReverse())
2774         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2775 
2776       State.set(VPDefs[J], StridedVec);
2777       ++J;
2778     }
2779     return;
2780   }
2781 
2782   // The sub-vector type for the current instruction.
2783   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2784 
2785   // Vectorize the interleaved store group.
2786   Value *MaskForGaps =
2787       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2788   assert((!MaskForGaps || !State.VF.isScalable()) &&
2789          "masking gaps for scalable vectors is not yet supported.");
2790   ArrayRef<VPValue *> StoredValues = getStoredValues();
2791   // Collect the stored vector from each member.
2792   SmallVector<Value *, 4> StoredVecs;
2793   unsigned StoredIdx = 0;
2794   for (unsigned i = 0; i < InterleaveFactor; i++) {
2795     assert((Group->getMember(i) || MaskForGaps) &&
2796            "Fail to get a member from an interleaved store group");
2797     Instruction *Member = Group->getMember(i);
2798 
2799     // Skip the gaps in the group.
2800     if (!Member) {
2801       Value *Poison = PoisonValue::get(SubVT);
2802       StoredVecs.push_back(Poison);
2803       continue;
2804     }
2805 
2806     Value *StoredVec = State.get(StoredValues[StoredIdx]);
2807     ++StoredIdx;
2808 
2809     if (Group->isReverse())
2810       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2811 
2812     // If this member has a different type, cast it to a unified type.
2814     if (StoredVec->getType() != SubVT)
2815       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2816 
2817     StoredVecs.push_back(StoredVec);
2818   }
2819 
2820   // Interleave all the smaller vectors into one wider vector.
2821   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
2822   Instruction *NewStoreInstr;
2823   if (BlockInMask || MaskForGaps) {
2824     Value *GroupMask = CreateGroupMask(MaskForGaps);
2825     NewStoreInstr = State.Builder.CreateMaskedStore(
2826         IVec, ResAddr, Group->getAlign(), GroupMask);
2827   } else
2828     NewStoreInstr =
2829         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
2830 
2831   Group->addMetadata(NewStoreInstr);
2832 }
2833 
2834 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2835 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2836                                VPSlotTracker &SlotTracker) const {
2837   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2838   IG->getInsertPos()->printAsOperand(O, false);
2839   O << ", ";
2840   getAddr()->printAsOperand(O, SlotTracker);
2841   VPValue *Mask = getMask();
2842   if (Mask) {
2843     O << ", ";
2844     Mask->printAsOperand(O, SlotTracker);
2845   }
2846 
2847   unsigned OpIdx = 0;
2848   for (unsigned i = 0; i < IG->getFactor(); ++i) {
2849     if (!IG->getMember(i))
2850       continue;
2851     if (getNumStoreOperands() > 0) {
2852       O << "\n" << Indent << "  store ";
2853       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2854       O << " to index " << i;
2855     } else {
2856       O << "\n" << Indent << "  ";
2857       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2858       O << " = load from index " << i;
2859     }
2860     ++OpIdx;
2861   }
2862 }
2863 #endif
2864 
2865 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2866   Value *Start = getStartValue()->getLiveInIRValue();
2867   PHINode *Phi = PHINode::Create(Start->getType(), 2, "index");
2868   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2869 
2870   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2871   Phi->addIncoming(Start, VectorPH);
2872   Phi->setDebugLoc(getDebugLoc());
2873   State.set(this, Phi, /*IsScalar*/ true);
2874 }
2875 
2876 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2877 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2878                                    VPSlotTracker &SlotTracker) const {
2879   O << Indent << "EMIT ";
2880   printAsOperand(O, SlotTracker);
2881   O << " = CANONICAL-INDUCTION ";
2882   printOperands(O, SlotTracker);
2883 }
2884 #endif
2885 
2886 bool VPCanonicalIVPHIRecipe::isCanonical(
2887     InductionDescriptor::InductionKind Kind, VPValue *Start,
2888     VPValue *Step) const {
2889   // Must be an integer induction.
2890   if (Kind != InductionDescriptor::IK_IntInduction)
2891     return false;
2892   // Start must match the start value of this canonical induction.
2893   if (Start != getStartValue())
2894     return false;
2895 
2896   // If the step is defined by a recipe, it is not a ConstantInt.
2897   if (Step->getDefiningRecipe())
2898     return false;
2899 
2900   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2901   return StepC && StepC->isOne();
2902 }
2903 
2904 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2905   return IsScalarAfterVectorization &&
2906          (!IsScalable || vputils::onlyFirstLaneUsed(this));
2907 }
2908 
2909 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
2910   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
2911          "Not a pointer induction according to InductionDescriptor!");
2912   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
2913          "Unexpected type.");
2914   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
2915          "Recipe should have been replaced");
2916 
2917   auto *IVR = getParent()->getPlan()->getCanonicalIV();
2918   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
2919   unsigned CurrentPart = getUnrollPart(*this);
2920 
2921   // Build a pointer phi
2922   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
2923   Type *ScStValueType = ScalarStartValue->getType();
2924 
2925   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2926   PHINode *NewPointerPhi = nullptr;
2927   if (CurrentPart == 0) {
2928     NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
2929                                     CanonicalIV->getIterator());
2930     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
2931   } else {
2932     // The recipe has been unrolled. In that case, fetch the single pointer phi
2933     // shared among all unrolled parts of the recipe.
2934     auto *GEP =
2935         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
2936     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
2937   }
2938 
2939   // A pointer induction, performed by using a GEP.
2940   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
2941   Value *ScalarStepValue = State.get(getOperand(1), VPLane(0));
2942   Type *PhiType = IndDesc.getStep()->getType();
2943   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
2944   // Add induction update using an incorrect block temporarily. The phi node
2945   // will be fixed after VPlan execution. Note that at this point the latch
2946   // block cannot be used, as it does not exist yet.
2947   // TODO: Model increment value in VPlan, by turning the recipe into a
2948   // multi-def and a subclass of VPHeaderPHIRecipe.
2949   if (CurrentPart == 0) {
2950     // The recipe represents the first part of the pointer induction. Create the
2951     // GEP to increment the phi across all unrolled parts.
2952     unsigned UF = getParent()->getPlan()->getUF();
2953     Value *NumUnrolledElems =
2954         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
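    // For illustration (assuming a fixed VF of 4, UF of 2 and a pointer step S
    // in address units of the i8 GEP): ptr.ind below advances pointer.phi by
    // 4 * 2 * S per vector iteration, covering all unrolled parts at once.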
2955 
2956     Value *InductionGEP = GetElementPtrInst::Create(
2957         State.Builder.getInt8Ty(), NewPointerPhi,
2958         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
2959         InductionLoc);
2960 
2961     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
2962   }
2963 
2964   // Create the actual address GEPs that use the pointer phi as base and a
2965   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
2966   Type *VecPhiType = VectorType::get(PhiType, State.VF);
2967   Value *StartOffsetScalar = State.Builder.CreateMul(
2968       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
2969   Value *StartOffset =
2970       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
2971   // Create a vector of consecutive numbers from zero to VF - 1.
2972   StartOffset = State.Builder.CreateAdd(
2973       StartOffset, State.Builder.CreateStepVector(VecPhiType));
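  // For illustration (assuming a fixed VF of 4 and CurrentPart of 1):
  // StartOffset is now <4, 5, 6, 7>, and the vector GEP below scales each lane
  // by the scalar step to address the lanes of this unrolled part.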
2974 
2975   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
2976          "scalar step must be the same across all parts");
2977   Value *GEP = State.Builder.CreateGEP(
2978       State.Builder.getInt8Ty(), NewPointerPhi,
2979       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
2980                                                State.VF, ScalarStepValue)),
2981       "vector.gep");
2982   State.set(this, GEP);
2983 }
2984 
2985 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2986 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2987                                           VPSlotTracker &SlotTracker) const {
2988   assert((getNumOperands() == 2 || getNumOperands() == 4) &&
2989          "unexpected number of operands");
2990   O << Indent << "EMIT ";
2991   printAsOperand(O, SlotTracker);
2992   O << " = WIDEN-POINTER-INDUCTION ";
2993   getStartValue()->printAsOperand(O, SlotTracker);
2994   O << ", " << *IndDesc.getStep();
2995   if (getNumOperands() == 4) {
2996     O << ", ";
2997     getOperand(2)->printAsOperand(O, SlotTracker);
2998     O << ", ";
2999     getOperand(3)->printAsOperand(O, SlotTracker);
3000   }
3001 }
3002 #endif
3003 
3004 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3005   assert(!State.Lane && "cannot be used in per-lane");
3006   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3007   SCEVExpander Exp(SE, DL, "induction");
3008 
3009   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
3010                                  &*State.Builder.GetInsertPoint());
3011   assert(!State.ExpandedSCEVs.contains(Expr) &&
3012          "Same SCEV expanded multiple times");
3013   State.ExpandedSCEVs[Expr] = Res;
3014   State.set(this, Res, VPLane(0));
3015 }
3016 
3017 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3018 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3019                                VPSlotTracker &SlotTracker) const {
3020   O << Indent << "EMIT ";
3021   getVPSingleValue()->printAsOperand(O, SlotTracker);
3022   O << " = EXPAND SCEV " << *Expr;
3023 }
3024 #endif
3025 
3026 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3027   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3028   Type *STy = CanonicalIV->getType();
3029   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3030   ElementCount VF = State.VF;
3031   Value *VStart = VF.isScalar()
3032                       ? CanonicalIV
3033                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3034   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3035   if (VF.isVector()) {
3036     VStep = Builder.CreateVectorSplat(VF, VStep);
3037     VStep =
3038         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3039   }
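  // For illustration (assuming a fixed VF of 4, unroll part P and a canonical
  // IV value C): vec.iv becomes <C, C, C, C> + <4*P, 4*P+1, 4*P+2, 4*P+3>.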
3040   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3041   State.set(this, CanonicalVectorIV);
3042 }
3043 
3044 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3045 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3046                                      VPSlotTracker &SlotTracker) const {
3047   O << Indent << "EMIT ";
3048   printAsOperand(O, SlotTracker);
3049   O << " = WIDEN-CANONICAL-INDUCTION ";
3050   printOperands(O, SlotTracker);
3051 }
3052 #endif
3053 
3054 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3055   auto &Builder = State.Builder;
3056   // Create a vector from the initial value.
3057   auto *VectorInit = getStartValue()->getLiveInIRValue();
3058 
3059   Type *VecTy = State.VF.isScalar()
3060                     ? VectorInit->getType()
3061                     : VectorType::get(VectorInit->getType(), State.VF);
3062 
3063   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3064   if (State.VF.isVector()) {
3065     auto *IdxTy = Builder.getInt32Ty();
3066     auto *One = ConstantInt::get(IdxTy, 1);
3067     IRBuilder<>::InsertPointGuard Guard(Builder);
3068     Builder.SetInsertPoint(VectorPH->getTerminator());
3069     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3070     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3071     VectorInit = Builder.CreateInsertElement(
3072         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3073   }
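  // For illustration (assuming a fixed VF of 4 and an initial value Init):
  // vector.recur.init is <poison, poison, poison, Init>, so the last lane
  // carries the scalar initial value into the first vector iteration.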
3074 
3075   // Create a phi node for the new recurrence.
3076   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3077   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3078   Phi->addIncoming(VectorInit, VectorPH);
3079   State.set(this, Phi);
3080 }
3081 
3082 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3083 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3084                                             VPSlotTracker &SlotTracker) const {
3085   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3086   printAsOperand(O, SlotTracker);
3087   O << " = phi ";
3088   printOperands(O, SlotTracker);
3089 }
3090 #endif
3091 
3092 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3093   auto &Builder = State.Builder;
3094 
3095   // Reductions do not have to start at zero. They can start with
3096   // any loop-invariant value.
3097   VPValue *StartVPV = getStartValue();
3098   Value *StartV = StartVPV->getLiveInIRValue();
3099 
3100   // In order to support recurrences we need to be able to vectorize Phi nodes.
3101   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3102   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3103   // this value when we vectorize all of the instructions that use the PHI.
3104   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3105   Type *VecTy = ScalarPHI ? StartV->getType()
3106                           : VectorType::get(StartV->getType(), State.VF);
3107 
3108   BasicBlock *HeaderBB = State.CFG.PrevBB;
3109   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
3110          "recipe must be in the vector loop header");
3111   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3112   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3113   State.set(this, Phi, IsInLoop);
3114 
3115   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3116 
3117   Value *Iden = nullptr;
3118   RecurKind RK = RdxDesc.getRecurrenceKind();
3119   unsigned CurrentPart = getUnrollPart(*this);
3120 
3121   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3122       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3123     // MinMax and AnyOf reductions have the start value as their identity.
3124     if (ScalarPHI) {
3125       Iden = StartV;
3126     } else {
3127       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3128       Builder.SetInsertPoint(VectorPH->getTerminator());
3129       StartV = Iden = State.get(StartVPV);
3130     }
3131   } else {
3132     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3133                                        RdxDesc.getFastMathFlags());
3134 
3135     if (!ScalarPHI) {
3136       if (CurrentPart == 0) {
3137         // Create start and identity vector values for the reduction in the
3138         // preheader.
3139         // TODO: Introduce recipes in VPlan preheader to create initial values.
3140         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3141         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3142         Builder.SetInsertPoint(VectorPH->getTerminator());
3143         Constant *Zero = Builder.getInt32(0);
3144         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
3145       } else {
3146         Iden = Builder.CreateVectorSplat(State.VF, Iden);
3147       }
3148     }
3149   }
3150 
3151   Phi = cast<PHINode>(State.get(this, IsInLoop));
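  // For illustration (assuming a fixed VF of 4, an integer add reduction and a
  // start value S): part 0 starts from <S, 0, 0, 0>, while all other unrolled
  // parts start from the identity splat <0, 0, 0, 0>.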
3152   Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3153   Phi->addIncoming(StartVal, VectorPH);
3154 }
3155 
3156 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3157 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3158                                  VPSlotTracker &SlotTracker) const {
3159   O << Indent << "WIDEN-REDUCTION-PHI ";
3160 
3161   printAsOperand(O, SlotTracker);
3162   O << " = phi ";
3163   printOperands(O, SlotTracker);
3164 }
3165 #endif
3166 
3167 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3168   assert(EnableVPlanNativePath &&
3169          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3170 
3171   Value *Op0 = State.get(getOperand(0));
3172   Type *VecTy = Op0->getType();
3173   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3174   State.set(this, VecPhi);
3175 }
3176 
3177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3178 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3179                              VPSlotTracker &SlotTracker) const {
3180   O << Indent << "WIDEN-PHI ";
3181 
3182   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3183   // Unless all incoming values are modeled in VPlan, print the original PHI
3184   // directly.
3185   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3186   // values as VPValues.
3187   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3188     O << VPlanIngredient(OriginalPhi);
3189     return;
3190   }
3191 
3192   printAsOperand(O, SlotTracker);
3193   O << " = phi ";
3194   printOperands(O, SlotTracker);
3195 }
3196 #endif
3197 
3198 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3199 // remove VPActiveLaneMaskPHIRecipe.
3200 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3201   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3202   Value *StartMask = State.get(getOperand(0));
3203   PHINode *Phi =
3204       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3205   Phi->addIncoming(StartMask, VectorPH);
3206   Phi->setDebugLoc(getDebugLoc());
3207   State.set(this, Phi);
3208 }
3209 
3210 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3211 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3212                                       VPSlotTracker &SlotTracker) const {
3213   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3214 
3215   printAsOperand(O, SlotTracker);
3216   O << " = phi ";
3217   printOperands(O, SlotTracker);
3218 }
3219 #endif
3220 
3221 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
3222   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3223   Value *Start = State.get(getOperand(0), VPLane(0));
3224   PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
3225   Phi->addIncoming(Start, VectorPH);
3226   Phi->setDebugLoc(getDebugLoc());
3227   State.set(this, Phi, /*IsScalar=*/true);
3228 }
3229 
3230 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3231 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3232                                   VPSlotTracker &SlotTracker) const {
3233   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3234 
3235   printAsOperand(O, SlotTracker);
3236   O << " = phi ";
3237   printOperands(O, SlotTracker);
3238 }
3239 #endif
3240