1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "LoopVectorizationPlanner.h"
15 #include "VPlan.h"
16 #include "VPlanAnalysis.h"
17 #include "VPlanPatternMatch.h"
18 #include "VPlanUtils.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/IVDescriptors.h"
23 #include "llvm/IR/BasicBlock.h"
24 #include "llvm/IR/IRBuilder.h"
25 #include "llvm/IR/Instruction.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/IR/Value.h"
30 #include "llvm/IR/VectorBuilder.h"
31 #include "llvm/Support/Casting.h"
32 #include "llvm/Support/CommandLine.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/raw_ostream.h"
35 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
36 #include "llvm/Transforms/Utils/LoopUtils.h"
37 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
38 #include <cassert>
39 
40 using namespace llvm;
41 
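// Holds the generated IR values of a recipe, one Value per unroll part.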
42 using VectorParts = SmallVector<Value *, 2>;
43 
44 namespace llvm {
45 extern cl::opt<bool> EnableVPlanNativePath;
46 }
47 extern cl::opt<unsigned> ForceTargetInstructionCost;
48 
49 #define LV_NAME "loop-vectorize"
50 #define DEBUG_TYPE LV_NAME
51 
52 bool VPRecipeBase::mayWriteToMemory() const {
53   switch (getVPDefID()) {
54   case VPInstructionSC:
55     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
56   case VPInterleaveSC:
57     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
58   case VPWidenStoreEVLSC:
59   case VPWidenStoreSC:
60     return true;
61   case VPReplicateSC:
62     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
63         ->mayWriteToMemory();
64   case VPWidenCallSC:
65     return !cast<VPWidenCallRecipe>(this)
66                 ->getCalledScalarFunction()
67                 ->onlyReadsMemory();
68   case VPWidenIntrinsicSC:
69     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
70   case VPBranchOnMaskSC:
71   case VPScalarIVStepsSC:
72   case VPPredInstPHISC:
73     return false;
74   case VPBlendSC:
75   case VPReductionEVLSC:
76   case VPReductionSC:
77   case VPVectorPointerSC:
78   case VPWidenCanonicalIVSC:
79   case VPWidenCastSC:
80   case VPWidenGEPSC:
81   case VPWidenIntOrFpInductionSC:
82   case VPWidenLoadEVLSC:
83   case VPWidenLoadSC:
84   case VPWidenPHISC:
85   case VPWidenSC:
86   case VPWidenEVLSC:
87   case VPWidenSelectSC: {
88     const Instruction *I =
89         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
90     (void)I;
91     assert((!I || !I->mayWriteToMemory()) &&
92            "underlying instruction may write to memory");
93     return false;
94   }
95   default:
96     return true;
97   }
98 }
99 
100 bool VPRecipeBase::mayReadFromMemory() const {
101   switch (getVPDefID()) {
102   case VPInstructionSC:
103     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
104   case VPWidenLoadEVLSC:
105   case VPWidenLoadSC:
106     return true;
107   case VPReplicateSC:
108     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
109         ->mayReadFromMemory();
110   case VPWidenCallSC:
111     return !cast<VPWidenCallRecipe>(this)
112                 ->getCalledScalarFunction()
113                 ->onlyWritesMemory();
114   case VPWidenIntrinsicSC:
115     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
116   case VPBranchOnMaskSC:
117   case VPPredInstPHISC:
118   case VPScalarIVStepsSC:
119   case VPWidenStoreEVLSC:
120   case VPWidenStoreSC:
121     return false;
122   case VPBlendSC:
123   case VPReductionEVLSC:
124   case VPReductionSC:
125   case VPVectorPointerSC:
126   case VPWidenCanonicalIVSC:
127   case VPWidenCastSC:
128   case VPWidenGEPSC:
129   case VPWidenIntOrFpInductionSC:
130   case VPWidenPHISC:
131   case VPWidenSC:
132   case VPWidenEVLSC:
133   case VPWidenSelectSC: {
134     const Instruction *I =
135         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
136     (void)I;
137     assert((!I || !I->mayReadFromMemory()) &&
138            "underlying instruction may read from memory");
139     return false;
140   }
141   default:
142     return true;
143   }
144 }
145 
146 bool VPRecipeBase::mayHaveSideEffects() const {
147   switch (getVPDefID()) {
148   case VPDerivedIVSC:
149   case VPPredInstPHISC:
150   case VPScalarCastSC:
151   case VPReverseVectorPointerSC:
152     return false;
153   case VPInstructionSC:
154     return mayWriteToMemory();
155   case VPWidenCallSC: {
156     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
157     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
158   }
159   case VPWidenIntrinsicSC:
160     return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
161   case VPBlendSC:
162   case VPReductionEVLSC:
163   case VPReductionSC:
164   case VPScalarIVStepsSC:
165   case VPVectorPointerSC:
166   case VPWidenCanonicalIVSC:
167   case VPWidenCastSC:
168   case VPWidenGEPSC:
169   case VPWidenIntOrFpInductionSC:
170   case VPWidenPHISC:
171   case VPWidenPointerInductionSC:
172   case VPWidenSC:
173   case VPWidenEVLSC:
174   case VPWidenSelectSC: {
175     const Instruction *I =
176         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
177     (void)I;
178     assert((!I || !I->mayHaveSideEffects()) &&
179            "underlying instruction has side-effects");
180     return false;
181   }
182   case VPInterleaveSC:
183     return mayWriteToMemory();
184   case VPWidenLoadEVLSC:
185   case VPWidenLoadSC:
186   case VPWidenStoreEVLSC:
187   case VPWidenStoreSC:
188     assert(
189         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
190             mayWriteToMemory() &&
191         "mayHaveSideEffects result for ingredient differs from this "
192         "implementation");
193     return mayWriteToMemory();
194   case VPReplicateSC: {
195     auto *R = cast<VPReplicateRecipe>(this);
196     return R->getUnderlyingInstr()->mayHaveSideEffects();
197   }
198   default:
199     return true;
200   }
201 }
202 
203 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
204   assert(!Parent && "Recipe already in some VPBasicBlock");
205   assert(InsertPos->getParent() &&
206          "Insertion position not in any VPBasicBlock");
207   InsertPos->getParent()->insert(this, InsertPos->getIterator());
208 }
209 
210 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
211                                 iplist<VPRecipeBase>::iterator I) {
212   assert(!Parent && "Recipe already in some VPBasicBlock");
213   assert(I == BB.end() || I->getParent() == &BB);
214   BB.insert(this, I);
215 }
216 
217 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
218   assert(!Parent && "Recipe already in some VPBasicBlock");
219   assert(InsertPos->getParent() &&
220          "Insertion position not in any VPBasicBlock");
221   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
222 }
223 
224 void VPRecipeBase::removeFromParent() {
225   assert(getParent() && "Recipe not in any VPBasicBlock");
226   getParent()->getRecipeList().remove(getIterator());
227   Parent = nullptr;
228 }
229 
230 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
231   assert(getParent() && "Recipe not in any VPBasicBlock");
232   return getParent()->getRecipeList().erase(getIterator());
233 }
234 
235 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
236   removeFromParent();
237   insertAfter(InsertPos);
238 }
239 
240 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
241                               iplist<VPRecipeBase>::iterator I) {
242   removeFromParent();
243   insertBefore(BB, I);
244 }
245 
246 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
247   // Get the underlying instruction for the recipe, if there is one. It is
248   // used to
249   //   * decide if cost computation should be skipped for this recipe,
250   //   * apply forced target instruction cost.
251   Instruction *UI = nullptr;
252   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
253     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
254   else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
255     UI = IG->getInsertPos();
256   else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
257     UI = &WidenMem->getIngredient();
258 
259   InstructionCost RecipeCost;
260   if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
261     RecipeCost = 0;
262   } else {
263     RecipeCost = computeCost(VF, Ctx);
264     if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
265         RecipeCost.isValid())
266       RecipeCost = InstructionCost(ForceTargetInstructionCost);
267   }
268 
269   LLVM_DEBUG({
270     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
271     dump();
272   });
273   return RecipeCost;
274 }
275 
276 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
277                                           VPCostContext &Ctx) const {
278   llvm_unreachable("subclasses should implement computeCost");
279 }
280 
281 InstructionCost
282 VPPartialReductionRecipe::computeCost(ElementCount VF,
283                                       VPCostContext &Ctx) const {
284   std::optional<unsigned> Opcode = std::nullopt;
285   VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe();
286   if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR))
287     Opcode = std::make_optional(WidenR->getOpcode());
288 
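  // The bin-op's operands may themselves be widened extends; if so, their
  // source types and extend kinds are passed to the partial-reduction cost
  // query below.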
289   VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe();
290   VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe();
291 
292   auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
293   auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
294                                                      : BinOpR->getOperand(0));
295   auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
296                                                      : BinOpR->getOperand(1));
297 
298   auto GetExtendKind = [](VPRecipeBase *R) {
299     // The extend could come from outside the plan.
300     if (!R)
301       return TargetTransformInfo::PR_None;
302     auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
303     if (!WidenCastR)
304       return TargetTransformInfo::PR_None;
305     if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
306       return TargetTransformInfo::PR_ZeroExtend;
307     if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
308       return TargetTransformInfo::PR_SignExtend;
309     return TargetTransformInfo::PR_None;
310   };
311 
312   return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
313                                          PhiType, VF, GetExtendKind(ExtAR),
314                                          GetExtendKind(ExtBR), Opcode);
315 }
316 
317 void VPPartialReductionRecipe::execute(VPTransformState &State) {
318   State.setDebugLocFrom(getDebugLoc());
319   auto &Builder = State.Builder;
320 
321   assert(getOpcode() == Instruction::Add &&
322          "Unhandled partial reduction opcode");
323 
324   Value *BinOpVal = State.get(getOperand(0));
325   Value *PhiVal = State.get(getOperand(1));
326   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
327 
328   Type *RetTy = PhiVal->getType();
329 
330   CallInst *V = Builder.CreateIntrinsic(
331       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
332       {PhiVal, BinOpVal}, nullptr, "partial.reduce");
333 
334   State.set(this, V);
335 }
336 
337 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
338 void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
339                                      VPSlotTracker &SlotTracker) const {
340   O << Indent << "PARTIAL-REDUCE ";
341   printAsOperand(O, SlotTracker);
342   O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
343   printOperands(O, SlotTracker);
344 }
345 #endif
346 
347 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
348   assert(OpType == OperationType::FPMathOp &&
349          "recipe doesn't have fast math flags");
350   FastMathFlags Res;
351   Res.setAllowReassoc(FMFs.AllowReassoc);
352   Res.setNoNaNs(FMFs.NoNaNs);
353   Res.setNoInfs(FMFs.NoInfs);
354   Res.setNoSignedZeros(FMFs.NoSignedZeros);
355   Res.setAllowReciprocal(FMFs.AllowReciprocal);
356   Res.setAllowContract(FMFs.AllowContract);
357   Res.setApproxFunc(FMFs.ApproxFunc);
358   return Res;
359 }
360 
361 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
362 void VPSingleDefRecipe::dump() const { VPDef::dump(); }
363 #endif
364 
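// Returns the operand of \p U that holds the unroll part (operand PartOpIdx),
// or nullptr if \p U has not been given a part operand by unrolling.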
365 template <unsigned PartOpIdx>
366 VPValue *
367 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
368   if (U.getNumOperands() == PartOpIdx + 1)
369     return U.getOperand(PartOpIdx);
370   return nullptr;
371 }
372 
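// Returns \p U's unroll part as an unsigned, or 0 if \p U has no unroll-part
// operand.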
373 template <unsigned PartOpIdx>
374 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
375   if (auto *UnrollPartOp = getUnrollPartOperand(U))
376     return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
377   return 0;
378 }
379 
380 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
381                              VPValue *A, VPValue *B, DebugLoc DL,
382                              const Twine &Name)
383     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
384                           Pred, DL),
385       Opcode(Opcode), Name(Name.str()) {
386   assert(Opcode == Instruction::ICmp &&
387          "only ICmp predicates supported at the moment");
388 }
389 
390 VPInstruction::VPInstruction(unsigned Opcode,
391                              std::initializer_list<VPValue *> Operands,
392                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
393     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
394       Opcode(Opcode), Name(Name.str()) {
395   // Make sure the VPInstruction is a floating-point operation.
396   assert(isFPMathOp() && "this op can't take fast-math flags");
397 }
398 
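// A PtrAdd produces one scalar pointer per lane unless only the first lane of
// its result is used.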
399 bool VPInstruction::doesGeneratePerAllLanes() const {
400   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
401 }
402 
403 bool VPInstruction::canGenerateScalarForFirstLane() const {
404   if (Instruction::isBinaryOp(getOpcode()))
405     return true;
406   if (isSingleScalar() || isVectorToScalar())
407     return true;
408   switch (Opcode) {
409   case Instruction::ICmp:
410   case Instruction::Select:
411   case VPInstruction::BranchOnCond:
412   case VPInstruction::BranchOnCount:
413   case VPInstruction::CalculateTripCountMinusVF:
414   case VPInstruction::CanonicalIVIncrementForPart:
415   case VPInstruction::PtrAdd:
416   case VPInstruction::ExplicitVectorLength:
417   case VPInstruction::AnyOf:
418     return true;
419   default:
420     return false;
421   }
422 }
423 
424 Value *VPInstruction::generatePerLane(VPTransformState &State,
425                                       const VPLane &Lane) {
426   IRBuilderBase &Builder = State.Builder;
427 
428   assert(getOpcode() == VPInstruction::PtrAdd &&
429          "only PtrAdd opcodes are supported for now");
430   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
431                               State.get(getOperand(1), Lane), Name);
432 }
433 
434 Value *VPInstruction::generate(VPTransformState &State) {
435   IRBuilderBase &Builder = State.Builder;
436 
437   if (Instruction::isBinaryOp(getOpcode())) {
438     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
439     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
440     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
441     auto *Res =
442         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
443     if (auto *I = dyn_cast<Instruction>(Res))
444       setFlags(I);
445     return Res;
446   }
447 
448   switch (getOpcode()) {
449   case VPInstruction::Not: {
450     Value *A = State.get(getOperand(0));
451     return Builder.CreateNot(A, Name);
452   }
453   case Instruction::ICmp: {
454     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
455     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
456     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
457     return Builder.CreateCmp(getPredicate(), A, B, Name);
458   }
459   case Instruction::Select: {
460     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
461     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
462     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
463     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
464     return Builder.CreateSelect(Cond, Op1, Op2, Name);
465   }
466   case VPInstruction::ActiveLaneMask: {
467     // Get first lane of vector induction variable.
468     Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
469     // Get the original loop tripcount.
470     Value *ScalarTC = State.get(getOperand(1), VPLane(0));
471 
472     // If this part of the active lane mask is scalar, generate the CMP directly
473     // to avoid unnecessary extracts.
474     if (State.VF.isScalar())
475       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
476                                Name);
477 
478     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
479     auto *PredTy = VectorType::get(Int1Ty, State.VF);
480     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
481                                    {PredTy, ScalarTC->getType()},
482                                    {VIVElem0, ScalarTC}, nullptr, Name);
483   }
484   case VPInstruction::FirstOrderRecurrenceSplice: {
485     // Generate code to combine the previous and current values in vector v3.
486     //
487     //   vector.ph:
488     //     v_init = vector(..., ..., ..., a[-1])
489     //     br vector.body
490     //
491     //   vector.body
492     //     i = phi [0, vector.ph], [i+4, vector.body]
493     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
494     //     v2 = a[i, i+1, i+2, i+3];
495     //     v3 = vector(v1(3), v2(0, 1, 2))
496 
497     auto *V1 = State.get(getOperand(0));
498     if (!V1->getType()->isVectorTy())
499       return V1;
500     Value *V2 = State.get(getOperand(1));
501     return Builder.CreateVectorSplice(V1, V2, -1, Name);
502   }
503   case VPInstruction::CalculateTripCountMinusVF: {
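    // Compute TC - Step if TC > Step and 0 otherwise, where Step = VF * UF,
    // i.e. an unsigned saturating subtraction of the step from the trip count.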
504     unsigned UF = getParent()->getPlan()->getUF();
505     Value *ScalarTC = State.get(getOperand(0), VPLane(0));
506     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
507     Value *Sub = Builder.CreateSub(ScalarTC, Step);
508     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
509     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
510     return Builder.CreateSelect(Cmp, Sub, Zero);
511   }
512   case VPInstruction::ExplicitVectorLength: {
513     // TODO: Restructure this code with an explicit remainder loop; vsetvli can
514     // then be outside of the main loop.
515     Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
516     // Compute EVL
517     assert(AVL->getType()->isIntegerTy() &&
518            "Requested vector length should be an integer.");
519 
520     assert(State.VF.isScalable() && "Expected scalable vector factor.");
521     Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
522 
523     Value *EVL = State.Builder.CreateIntrinsic(
524         State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
525         {AVL, VFArg, State.Builder.getTrue()});
526     return EVL;
527   }
528   case VPInstruction::CanonicalIVIncrementForPart: {
529     unsigned Part = getUnrollPart(*this);
530     auto *IV = State.get(getOperand(0), VPLane(0));
531     assert(Part != 0 && "Must have a positive part");
532     // The canonical IV is incremented by the vectorization factor (number of
533     // SIMD elements) times the unroll part.
534     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
535     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
536                              hasNoSignedWrap());
537   }
538   case VPInstruction::BranchOnCond: {
539     Value *Cond = State.get(getOperand(0), VPLane(0));
540     // Replace the temporary unreachable terminator with a new conditional
541     // branch, hooking it up to the backward destination now for exiting blocks
542     // and to the forward destination(s) later when they are created.
543     BranchInst *CondBr =
544         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
545     CondBr->setSuccessor(0, nullptr);
546     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
547 
548     if (!getParent()->isExiting())
549       return CondBr;
550 
551     VPRegionBlock *ParentRegion = getParent()->getParent();
552     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
553     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
554     return CondBr;
555   }
556   case VPInstruction::BranchOnCount: {
557     // First create the compare.
558     Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
559     Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
560     Value *Cond = Builder.CreateICmpEQ(IV, TC);
561 
562     // Now create the branch.
563     auto *Plan = getParent()->getPlan();
564     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
565     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
566 
567     // Replace the temporary unreachable terminator with a new conditional
568     // branch, hooking it up to the backward destination (the header) now and
569     // to the forward destination (the exit/middle block) later when it is
570     // created. Note that CreateCondBr expects a valid BB as its first argument,
571     // so we pass the current block and reset that successor to nullptr later.
572     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
573                                               State.CFG.VPBB2IRBB[Header]);
574     CondBr->setSuccessor(0, nullptr);
575     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
576     return CondBr;
577   }
578   case VPInstruction::ComputeReductionResult: {
579     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
580     // and will be removed by breaking up the recipe further.
581     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
582     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
583     // Get its reduction variable descriptor.
584     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
585 
586     RecurKind RK = RdxDesc.getRecurrenceKind();
587 
588     Type *PhiTy = OrigPhi->getType();
589     // The recipe's operands are the reduction phi, followed by one operand for
590     // each part of the reduction.
591     unsigned UF = getNumOperands() - 1;
592     VectorParts RdxParts(UF);
593     for (unsigned Part = 0; Part < UF; ++Part)
594       RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
595 
596     // If the vector reduction can be performed in a smaller type, we truncate
597     // then extend the loop exit value to enable InstCombine to evaluate the
598     // entire expression in the smaller type.
599     // TODO: Handle this in truncateToMinBW.
600     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
601       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
602       for (unsigned Part = 0; Part < UF; ++Part)
603         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
604     }
605     // Reduce all of the unrolled parts into a single vector.
606     Value *ReducedPartRdx = RdxParts[0];
607     unsigned Op = RdxDesc.getOpcode();
608     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
609       Op = Instruction::Or;
610 
611     if (PhiR->isOrdered()) {
612       ReducedPartRdx = RdxParts[UF - 1];
613     } else {
614       // Floating-point operations should have some FMF to enable the reduction.
615       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
616       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
617       for (unsigned Part = 1; Part < UF; ++Part) {
618         Value *RdxPart = RdxParts[Part];
619         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
620           ReducedPartRdx = Builder.CreateBinOp(
621               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
622         else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
623           ReducedPartRdx =
624               createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
625         else
626           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
627       }
628     }
629 
630     // Create the reduction after the loop. Note that inloop reductions create
631     // the target reduction in the loop using a Reduction recipe.
632     if ((State.VF.isVector() ||
633          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
634          RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
635         !PhiR->isInLoop()) {
636       ReducedPartRdx =
637           createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
638       // If the reduction can be performed in a smaller type, we need to extend
639       // the reduction to the wider type before we branch to the original loop.
640       if (PhiTy != RdxDesc.getRecurrenceType())
641         ReducedPartRdx = RdxDesc.isSigned()
642                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
643                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
644     }
645 
646     return ReducedPartRdx;
647   }
648   case VPInstruction::ExtractFromEnd: {
649     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
650     unsigned Offset = CI->getZExtValue();
651     assert(Offset > 0 && "Offset from end must be positive");
652     Value *Res;
653     if (State.VF.isVector()) {
654       assert(Offset <= State.VF.getKnownMinValue() &&
655              "invalid offset to extract from");
656       // Extract lane VF - Offset from the operand.
657       Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
658     } else {
659       assert(Offset <= 1 && "invalid offset to extract from");
660       Res = State.get(getOperand(0));
661     }
662     if (isa<ExtractElementInst>(Res))
663       Res->setName(Name);
664     return Res;
665   }
666   case VPInstruction::LogicalAnd: {
667     Value *A = State.get(getOperand(0));
668     Value *B = State.get(getOperand(1));
669     return Builder.CreateLogicalAnd(A, B, Name);
670   }
671   case VPInstruction::PtrAdd: {
672     assert(vputils::onlyFirstLaneUsed(this) &&
673            "can only generate first lane for PtrAdd");
674     Value *Ptr = State.get(getOperand(0), VPLane(0));
675     Value *Addend = State.get(getOperand(1), VPLane(0));
676     return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
677   }
678   case VPInstruction::ResumePhi: {
679     Value *IncomingFromVPlanPred =
680         State.get(getOperand(0), /* IsScalar */ true);
681     Value *IncomingFromOtherPreds =
682         State.get(getOperand(1), /* IsScalar */ true);
683     auto *NewPhi =
684         Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
685     BasicBlock *VPlanPred =
686         State.CFG
687             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
688     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
689     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
690       if (OtherPred == VPlanPred)
691         continue;
692       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
693     }
694     return NewPhi;
695   }
696   case VPInstruction::AnyOf: {
697     Value *A = State.get(getOperand(0));
698     return Builder.CreateOrReduce(A);
699   }
700 
701   default:
702     llvm_unreachable("Unsupported opcode for instruction");
703   }
704 }
705 
706 bool VPInstruction::isVectorToScalar() const {
707   return getOpcode() == VPInstruction::ExtractFromEnd ||
708          getOpcode() == VPInstruction::ComputeReductionResult ||
709          getOpcode() == VPInstruction::AnyOf;
710 }
711 
712 bool VPInstruction::isSingleScalar() const {
713   return getOpcode() == VPInstruction::ResumePhi;
714 }
715 
716 #if !defined(NDEBUG)
717 bool VPInstruction::isFPMathOp() const {
718   // Inspired by FPMathOperator::classof. Notable differences are that we don't
719   // support Call, PHI and Select opcodes here yet.
720   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
721          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
722          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
723          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
724 }
725 #endif
726 
727 void VPInstruction::execute(VPTransformState &State) {
728   assert(!State.Lane && "VPInstruction executing a Lane");
729   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
730   assert((hasFastMathFlags() == isFPMathOp() ||
731           getOpcode() == Instruction::Select) &&
732          "Recipe not a FPMathOp but has fast-math flags?");
733   if (hasFastMathFlags())
734     State.Builder.setFastMathFlags(getFastMathFlags());
735   State.setDebugLocFrom(getDebugLoc());
736   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
737                                    (vputils::onlyFirstLaneUsed(this) ||
738                                     isVectorToScalar() || isSingleScalar());
739   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
740   if (GeneratesPerAllLanes) {
741     for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
742          Lane != NumLanes; ++Lane) {
743       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
744       assert(GeneratedValue && "generatePerLane must produce a value");
745       State.set(this, GeneratedValue, VPLane(Lane));
746     }
747     return;
748   }
749 
750   Value *GeneratedValue = generate(State);
751   if (!hasResult())
752     return;
753   assert(GeneratedValue && "generate must produce a value");
754   assert(
755       (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
756        State.VF.isScalar()) &&
757       "scalar value but not only first lane defined");
758   State.set(this, GeneratedValue,
759             /*IsScalar*/ GeneratesPerFirstLaneOnly);
760 }
761 
762 bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
763   if (Instruction::isBinaryOp(getOpcode()))
764     return false;
765   switch (getOpcode()) {
766   case Instruction::ICmp:
767   case Instruction::Select:
768   case VPInstruction::AnyOf:
769   case VPInstruction::CalculateTripCountMinusVF:
770   case VPInstruction::CanonicalIVIncrementForPart:
771   case VPInstruction::ExtractFromEnd:
772   case VPInstruction::FirstOrderRecurrenceSplice:
773   case VPInstruction::LogicalAnd:
774   case VPInstruction::Not:
775   case VPInstruction::PtrAdd:
776     return false;
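  // Conservatively assume any other opcode may access memory.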
777   default:
778     return true;
779   }
780 }
781 
782 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
783   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
784   if (Instruction::isBinaryOp(getOpcode()))
785     return vputils::onlyFirstLaneUsed(this);
786 
787   switch (getOpcode()) {
788   default:
789     return false;
790   case Instruction::ICmp:
791   case Instruction::Select:
792   case Instruction::Or:
793   case VPInstruction::PtrAdd:
794     // TODO: Cover additional opcodes.
795     return vputils::onlyFirstLaneUsed(this);
796   case VPInstruction::ActiveLaneMask:
797   case VPInstruction::ExplicitVectorLength:
798   case VPInstruction::CalculateTripCountMinusVF:
799   case VPInstruction::CanonicalIVIncrementForPart:
800   case VPInstruction::BranchOnCount:
801   case VPInstruction::BranchOnCond:
802   case VPInstruction::ResumePhi:
803     return true;
804   };
805   llvm_unreachable("switch should return");
806 }
807 
808 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
809   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
810   if (Instruction::isBinaryOp(getOpcode()))
811     return vputils::onlyFirstPartUsed(this);
812 
813   switch (getOpcode()) {
814   default:
815     return false;
816   case Instruction::ICmp:
817   case Instruction::Select:
818     return vputils::onlyFirstPartUsed(this);
819   case VPInstruction::BranchOnCount:
820   case VPInstruction::BranchOnCond:
821   case VPInstruction::CanonicalIVIncrementForPart:
822     return true;
823   };
824   llvm_unreachable("switch should return");
825 }
826 
827 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
828 void VPInstruction::dump() const {
829   VPSlotTracker SlotTracker(getParent()->getPlan());
830   print(dbgs(), "", SlotTracker);
831 }
832 
833 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
834                           VPSlotTracker &SlotTracker) const {
835   O << Indent << "EMIT ";
836 
837   if (hasResult()) {
838     printAsOperand(O, SlotTracker);
839     O << " = ";
840   }
841 
842   switch (getOpcode()) {
843   case VPInstruction::Not:
844     O << "not";
845     break;
846   case VPInstruction::SLPLoad:
847     O << "combined load";
848     break;
849   case VPInstruction::SLPStore:
850     O << "combined store";
851     break;
852   case VPInstruction::ActiveLaneMask:
853     O << "active lane mask";
854     break;
855   case VPInstruction::ResumePhi:
856     O << "resume-phi";
857     break;
858   case VPInstruction::ExplicitVectorLength:
859     O << "EXPLICIT-VECTOR-LENGTH";
860     break;
861   case VPInstruction::FirstOrderRecurrenceSplice:
862     O << "first-order splice";
863     break;
864   case VPInstruction::BranchOnCond:
865     O << "branch-on-cond";
866     break;
867   case VPInstruction::CalculateTripCountMinusVF:
868     O << "TC > VF ? TC - VF : 0";
869     break;
870   case VPInstruction::CanonicalIVIncrementForPart:
871     O << "VF * Part +";
872     break;
873   case VPInstruction::BranchOnCount:
874     O << "branch-on-count";
875     break;
876   case VPInstruction::ExtractFromEnd:
877     O << "extract-from-end";
878     break;
879   case VPInstruction::ComputeReductionResult:
880     O << "compute-reduction-result";
881     break;
882   case VPInstruction::LogicalAnd:
883     O << "logical-and";
884     break;
885   case VPInstruction::PtrAdd:
886     O << "ptradd";
887     break;
888   case VPInstruction::AnyOf:
889     O << "any-of";
890     break;
891   default:
892     O << Instruction::getOpcodeName(getOpcode());
893   }
894 
895   printFlags(O);
896   printOperands(O, SlotTracker);
897 
898   if (auto DL = getDebugLoc()) {
899     O << ", !dbg ";
900     DL.print(O);
901   }
902 }
903 #endif
904 
905 void VPIRInstruction::execute(VPTransformState &State) {
906   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
907          "Only PHINodes can have extra operands");
908   for (const auto &[Idx, Op] : enumerate(operands())) {
909     VPValue *ExitValue = Op;
910     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
911                     ? VPLane::getFirstLane()
912                     : VPLane::getLastLaneForVF(State.VF);
913     VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
914     auto *PredVPBB = Pred->getExitingBasicBlock();
915     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
916     // Set insertion point in PredBB in case an extract needs to be generated.
917     // TODO: Model extracts explicitly.
918     State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
919     Value *V = State.get(ExitValue, VPLane(Lane));
920     auto *Phi = cast<PHINode>(&I);
921     // If the phi has no incoming value for PredBB yet, add a new incoming
922     // value. Otherwise update the existing incoming value for PredBB.
923     if (Phi->getBasicBlockIndex(PredBB) == -1)
924       Phi->addIncoming(V, PredBB);
925     else
926       Phi->setIncomingValueForBlock(PredBB, V);
927   }
928 
929   // Advance the insert point after the wrapped IR instruction. This allows
930   // interleaving VPIRInstructions and other recipes.
931   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
932 }
933 
934 InstructionCost VPIRInstruction::computeCost(ElementCount VF,
935                                              VPCostContext &Ctx) const {
936   // The recipe wraps an existing IR instruction on the border of VPlan's scope,
937   // hence it does not contribute to the cost-modeling for the VPlan.
938   return 0;
939 }
940 
941 void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) {
942   assert(isa<PHINode>(getInstruction()) &&
943          "can only add exiting operands to phi nodes");
944   assert(getNumOperands() == 1 && "must have a single operand");
945   VPValue *Exiting = getOperand(0);
946   if (!Exiting->isLiveIn()) {
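    // Extract the last lane (offset 1 from the end) of the exiting value so a
    // single scalar feeds the phi.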
947     LLVMContext &Ctx = getInstruction().getContext();
948     auto &Plan = *getParent()->getPlan();
949     Exiting = Builder.createNaryOp(
950         VPInstruction::ExtractFromEnd,
951         {Exiting,
952          Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::get(Ctx, 32), 1))});
953   }
954   setOperand(0, Exiting);
955 }
956 
957 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
958 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
959                             VPSlotTracker &SlotTracker) const {
960   O << Indent << "IR " << I;
961 
962   if (getNumOperands() != 0) {
963     O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
964     interleaveComma(
965         enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
966           Op.value()->printAsOperand(O, SlotTracker);
967           O << " from ";
968           getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
969         });
970     O << ")";
971   }
972 }
973 #endif
974 
975 void VPWidenCallRecipe::execute(VPTransformState &State) {
976   assert(State.VF.isVector() && "not widening");
977   State.setDebugLocFrom(getDebugLoc());
978 
979   FunctionType *VFTy = Variant->getFunctionType();
981   SmallVector<Value *, 4> Args;
982   for (const auto &I : enumerate(arg_operands())) {
983     Value *Arg;
984     // Some vectorized function variants may also take a scalar argument,
985     // e.g. linear parameters for pointers. This needs to be the scalar value
986     // from the start of the respective part when interleaving.
987     if (!VFTy->getParamType(I.index())->isVectorTy())
988       Arg = State.get(I.value(), VPLane(0));
989     else
990       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
991     Args.push_back(Arg);
992   }
993 
994   assert(Variant != nullptr && "Can't create vector function.");
995 
996   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
997   SmallVector<OperandBundleDef, 1> OpBundles;
998   if (CI)
999     CI->getOperandBundlesAsDefs(OpBundles);
1000 
1001   CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1002   setFlags(V);
1003 
1004   if (!V->getType()->isVoidTy())
1005     State.set(this, V);
1006   State.addMetadata(V, CI);
1007 }
1008 
1009 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1010                                                VPCostContext &Ctx) const {
1011   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1012                                   Variant->getFunctionType()->params(),
1013                                   Ctx.CostKind);
1014 }
1015 
1016 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1017 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
1018                               VPSlotTracker &SlotTracker) const {
1019   O << Indent << "WIDEN-CALL ";
1020 
1021   Function *CalledFn = getCalledScalarFunction();
1022   if (CalledFn->getReturnType()->isVoidTy())
1023     O << "void ";
1024   else {
1025     printAsOperand(O, SlotTracker);
1026     O << " = ";
1027   }
1028 
1029   O << "call";
1030   printFlags(O);
1031   O << " @" << CalledFn->getName() << "(";
1032   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
1033     Op->printAsOperand(O, SlotTracker);
1034   });
1035   O << ")";
1036 
1037   O << " (using library function";
1038   if (Variant->hasName())
1039     O << ": " << Variant->getName();
1040   O << ")";
1041 }
1042 #endif
1043 
1044 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1045   assert(State.VF.isVector() && "not widening");
1046   State.setDebugLocFrom(getDebugLoc());
1047 
1048   SmallVector<Type *, 2> TysForDecl;
1049   // Add return type if intrinsic is overloaded on it.
1050   if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1051     TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1052   SmallVector<Value *, 4> Args;
1053   for (const auto &I : enumerate(operands())) {
1054     // Some intrinsics have a scalar argument - don't replace it with a
1055     // vector.
1056     Value *Arg;
1057     if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1058                                            State.TTI))
1059       Arg = State.get(I.value(), VPLane(0));
1060     else
1061       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1062     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1063                                                State.TTI))
1064       TysForDecl.push_back(Arg->getType());
1065     Args.push_back(Arg);
1066   }
1067 
1068   // Use vector version of the intrinsic.
1069   Module *M = State.Builder.GetInsertBlock()->getModule();
1070   Function *VectorF =
1071       Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1072   assert(VectorF &&
1073          "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1074 
1075   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1076   SmallVector<OperandBundleDef, 1> OpBundles;
1077   if (CI)
1078     CI->getOperandBundlesAsDefs(OpBundles);
1079 
1080   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1081 
1082   setFlags(V);
1083 
1084   if (!V->getType()->isVoidTy())
1085     State.set(this, V);
1086   State.addMetadata(V, CI);
1087 }
1088 
1089 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1090                                                     VPCostContext &Ctx) const {
1091   // Some backends analyze intrinsic arguments to determine cost. Use the
1092   // underlying value for the operand if it has one. Otherwise try to use the
1093   // operand of the underlying call instruction, if there is one. Otherwise
1094   // clear Arguments.
1095   // TODO: Rework TTI interface to be independent of concrete IR values.
1096   SmallVector<const Value *> Arguments;
1097   for (const auto &[Idx, Op] : enumerate(operands())) {
1098     auto *V = Op->getUnderlyingValue();
1099     if (!V) {
1100       // Push all of the VP intrinsic's operands into Arguments, even if the
1101       // operand is nullptr. Some VP intrinsic cost computations assert on the
1102       // number of parameters. This mainly happens in two scenarios:
1103       // 1. The EVL operand is nullptr.
1104       // 2. An argument of the VP intrinsic is itself a VP intrinsic.
1105       if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
1106         Arguments.push_back(V);
1107         continue;
1108       }
1109       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
1110         Arguments.push_back(UI->getArgOperand(Idx));
1111         continue;
1112       }
1113       Arguments.clear();
1114       break;
1115     }
1116     Arguments.push_back(V);
1117   }
1118 
1119   Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1120   SmallVector<Type *> ParamTys;
1121   for (unsigned I = 0; I != getNumOperands(); ++I)
1122     ParamTys.push_back(
1123         toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
1124 
1125   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1126   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1127   IntrinsicCostAttributes CostAttrs(
1128       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1129       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
1130   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1131 }
1132 
1133 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1134   return Intrinsic::getBaseName(VectorIntrinsicID);
1135 }
1136 
1137 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1138   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1139   // Vector predication intrinsics only demand the first lane of the last
1140   // operand (the EVL operand).
1141   return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
1142          Op == getOperand(getNumOperands() - 1);
1143 }
1144 
1145 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1146 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
1147                                    VPSlotTracker &SlotTracker) const {
1148   O << Indent << "WIDEN-INTRINSIC ";
1149   if (ResultTy->isVoidTy()) {
1150     O << "void ";
1151   } else {
1152     printAsOperand(O, SlotTracker);
1153     O << " = ";
1154   }
1155 
1156   O << "call";
1157   printFlags(O);
1158   O << getIntrinsicName() << "(";
1159 
1160   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1161     Op->printAsOperand(O, SlotTracker);
1162   });
1163   O << ")";
1164 }
1165 #endif
1166 
1167 void VPHistogramRecipe::execute(VPTransformState &State) {
1168   State.setDebugLocFrom(getDebugLoc());
1169   IRBuilderBase &Builder = State.Builder;
1170 
1171   Value *Address = State.get(getOperand(0));
1172   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1173   VectorType *VTy = cast<VectorType>(Address->getType());
1174 
1175   // The histogram intrinsic requires a mask even if the recipe doesn't;
1176   // if the mask operand was omitted then all lanes should be executed and
1177   // we just need to synthesize an all-true mask.
1178   Value *Mask = nullptr;
1179   if (VPValue *VPMask = getMask())
1180     Mask = State.get(VPMask);
1181   else
1182     Mask =
1183         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1184 
1185   // If this is a subtract, we want to invert the increment amount. We may
1186   // add a separate intrinsic in future, but for now we'll try this.
1187   if (Opcode == Instruction::Sub)
1188     IncAmt = Builder.CreateNeg(IncAmt);
1189   else
1190     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1191 
1192   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1193                                 {VTy, IncAmt->getType()},
1194                                 {Address, IncAmt, Mask});
1195 }
1196 
1197 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1198                                                VPCostContext &Ctx) const {
1199   // FIXME: Take the gather and scatter into account as well. For now we're
1200   //        generating the same cost as the fallback path, but we'll likely
1201   //        need to create a new TTI method for determining the cost, including
1202   //        whether we can use base + vec-of-smaller-indices or just
1203   //        vec-of-pointers.
1204   assert(VF.isVector() && "Invalid VF for histogram cost");
1205   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1206   VPValue *IncAmt = getOperand(1);
1207   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1208   VectorType *VTy = VectorType::get(IncTy, VF);
1209 
1210   // Assume that a non-constant update value (or a constant != 1) requires
1211   // a multiply, and add that into the cost.
1212   InstructionCost MulCost =
1213       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1214   if (IncAmt->isLiveIn()) {
1215     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1216 
1217     if (CI && CI->getZExtValue() == 1)
1218       MulCost = TTI::TCC_Free;
1219   }
1220 
1221   // Find the cost of the histogram operation itself.
1222   Type *PtrTy = VectorType::get(AddressTy, VF);
1223   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1224   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1225                               Type::getVoidTy(Ctx.LLVMCtx),
1226                               {PtrTy, IncTy, MaskTy});
1227 
1228   // Add the costs together with the add/sub operation.
1229   return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1230          Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1231 }
1232 
1233 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1234 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1235                               VPSlotTracker &SlotTracker) const {
1236   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1237   getOperand(0)->printAsOperand(O, SlotTracker);
1238 
1239   if (Opcode == Instruction::Sub)
1240     O << ", dec: ";
1241   else {
1242     assert(Opcode == Instruction::Add);
1243     O << ", inc: ";
1244   }
1245   getOperand(1)->printAsOperand(O, SlotTracker);
1246 
1247   if (VPValue *Mask = getMask()) {
1248     O << ", mask: ";
1249     Mask->printAsOperand(O, SlotTracker);
1250   }
1251 }
1252 
1253 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1254                                 VPSlotTracker &SlotTracker) const {
1255   O << Indent << "WIDEN-SELECT ";
1256   printAsOperand(O, SlotTracker);
1257   O << " = select ";
1258   printFlags(O);
1259   getOperand(0)->printAsOperand(O, SlotTracker);
1260   O << ", ";
1261   getOperand(1)->printAsOperand(O, SlotTracker);
1262   O << ", ";
1263   getOperand(2)->printAsOperand(O, SlotTracker);
1264   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1265 }
1266 #endif
1267 
1268 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1269   State.setDebugLocFrom(getDebugLoc());
1270 
1271   // The condition can be loop invariant but still defined inside the
1272   // loop. This means that we can't just use the original 'cond' value.
1273   // We have to take the 'vectorized' value and pick the first lane.
1274   // Instcombine will make this a no-op.
1275   auto *InvarCond =
1276       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1277 
1278   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1279   Value *Op0 = State.get(getOperand(1));
1280   Value *Op1 = State.get(getOperand(2));
1281   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1282   State.set(this, Sel);
1283   if (isa<FPMathOperator>(Sel))
1284     setFlags(cast<Instruction>(Sel));
1285   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1286 }
1287 
1288 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1289                                                  VPCostContext &Ctx) const {
1290   SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
1291   bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1292   Type *ScalarTy = Ctx.Types.inferScalarType(this);
1293   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1294 
1295   VPValue *Op0, *Op1;
1296   using namespace llvm::VPlanPatternMatch;
1297   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1298       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1299        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1300     // select x, y, false --> x & y
1301     // select x, true, y --> x | y
1302     const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1303     const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1304 
1305     SmallVector<const Value *, 2> Operands;
1306     if (all_of(operands(),
1307                [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1308       Operands.append(SI->op_begin(), SI->op_end());
1309     bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1310     return Ctx.TTI.getArithmeticInstrCost(
1311         IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
1312         Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1313   }
1314 
1315   Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1316   if (!ScalarCond)
1317     CondTy = VectorType::get(CondTy, VF);
1318 
1319   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1320   if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
1321     Pred = Cmp->getPredicate();
1322   return Ctx.TTI.getCmpSelInstrCost(
1323       Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1324       {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1325 }
1326 
1327 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
1328     const FastMathFlags &FMF) {
1329   AllowReassoc = FMF.allowReassoc();
1330   NoNaNs = FMF.noNaNs();
1331   NoInfs = FMF.noInfs();
1332   NoSignedZeros = FMF.noSignedZeros();
1333   AllowReciprocal = FMF.allowReciprocal();
1334   AllowContract = FMF.allowContract();
1335   ApproxFunc = FMF.approxFunc();
1336 }
1337 
1338 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1339 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1340   switch (OpType) {
1341   case OperationType::Cmp:
1342     O << " " << CmpInst::getPredicateName(getPredicate());
1343     break;
1344   case OperationType::DisjointOp:
1345     if (DisjointFlags.IsDisjoint)
1346       O << " disjoint";
1347     break;
1348   case OperationType::PossiblyExactOp:
1349     if (ExactFlags.IsExact)
1350       O << " exact";
1351     break;
1352   case OperationType::OverflowingBinOp:
1353     if (WrapFlags.HasNUW)
1354       O << " nuw";
1355     if (WrapFlags.HasNSW)
1356       O << " nsw";
1357     break;
1358   case OperationType::FPMathOp:
1359     getFastMathFlags().print(O);
1360     break;
1361   case OperationType::GEPOp:
1362     if (GEPFlags.isInBounds())
1363       O << " inbounds";
1364     else if (GEPFlags.hasNoUnsignedSignedWrap())
1365       O << " nusw";
1366     if (GEPFlags.hasNoUnsignedWrap())
1367       O << " nuw";
1368     break;
1369   case OperationType::NonNegOp:
1370     if (NonNegFlags.NonNeg)
1371       O << " nneg";
1372     break;
1373   case OperationType::Other:
1374     break;
1375   }
1376   if (getNumOperands() > 0)
1377     O << " ";
1378 }
1379 #endif
1380 
1381 void VPWidenRecipe::execute(VPTransformState &State) {
1382   State.setDebugLocFrom(getDebugLoc());
1383   auto &Builder = State.Builder;
1384   switch (Opcode) {
1385   case Instruction::Call:
1386   case Instruction::Br:
1387   case Instruction::PHI:
1388   case Instruction::GetElementPtr:
1389   case Instruction::Select:
1390     llvm_unreachable("This instruction is handled by a different recipe.");
1391   case Instruction::UDiv:
1392   case Instruction::SDiv:
1393   case Instruction::SRem:
1394   case Instruction::URem:
1395   case Instruction::Add:
1396   case Instruction::FAdd:
1397   case Instruction::Sub:
1398   case Instruction::FSub:
1399   case Instruction::FNeg:
1400   case Instruction::Mul:
1401   case Instruction::FMul:
1402   case Instruction::FDiv:
1403   case Instruction::FRem:
1404   case Instruction::Shl:
1405   case Instruction::LShr:
1406   case Instruction::AShr:
1407   case Instruction::And:
1408   case Instruction::Or:
1409   case Instruction::Xor: {
1410     // Just widen unops and binops.
1411     SmallVector<Value *, 2> Ops;
1412     for (VPValue *VPOp : operands())
1413       Ops.push_back(State.get(VPOp));
1414 
1415     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1416 
1417     if (auto *VecOp = dyn_cast<Instruction>(V))
1418       setFlags(VecOp);
1419 
1420     // Use this vector value for all users of the original instruction.
1421     State.set(this, V);
1422     State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1423     break;
1424   }
1425   case Instruction::Freeze: {
1426     Value *Op = State.get(getOperand(0));
1427 
1428     Value *Freeze = Builder.CreateFreeze(Op);
1429     State.set(this, Freeze);
1430     break;
1431   }
1432   case Instruction::ICmp:
1433   case Instruction::FCmp: {
1434     // Widen compares. Generate vector compares.
1435     bool FCmp = Opcode == Instruction::FCmp;
1436     Value *A = State.get(getOperand(0));
1437     Value *B = State.get(getOperand(1));
1438     Value *C = nullptr;
1439     if (FCmp) {
1440       // Propagate fast math flags.
1441       C = Builder.CreateFCmpFMF(
1442           getPredicate(), A, B,
1443           dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1444     } else {
1445       C = Builder.CreateICmp(getPredicate(), A, B);
1446     }
1447     State.set(this, C);
1448     State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1449     break;
1450   }
1451   default:
1452     // This instruction is not vectorized by simple widening.
1453     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1454                       << Instruction::getOpcodeName(Opcode));
1455     llvm_unreachable("Unhandled instruction!");
1456   } // end of switch.
1457 
1458 #if !defined(NDEBUG)
1459   // Verify that VPlan type inference results agree with the type of the
1460   // generated values.
1461   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1462              State.get(this)->getType() &&
1463          "inferred type and type from generated instructions do not match");
1464 #endif
1465 }
1466 
1467 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1468                                            VPCostContext &Ctx) const {
1469   switch (Opcode) {
1470   case Instruction::FNeg: {
1471     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1472     return Ctx.TTI.getArithmeticInstrCost(
1473         Opcode, VectorTy, Ctx.CostKind,
1474         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1475         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1476   }
1477 
1478   case Instruction::UDiv:
1479   case Instruction::SDiv:
1480   case Instruction::SRem:
1481   case Instruction::URem:
1482     // More complex computation; let the legacy cost model handle this for now.
1483     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1484   case Instruction::Add:
1485   case Instruction::FAdd:
1486   case Instruction::Sub:
1487   case Instruction::FSub:
1488   case Instruction::Mul:
1489   case Instruction::FMul:
1490   case Instruction::FDiv:
1491   case Instruction::FRem:
1492   case Instruction::Shl:
1493   case Instruction::LShr:
1494   case Instruction::AShr:
1495   case Instruction::And:
1496   case Instruction::Or:
1497   case Instruction::Xor: {
1498     VPValue *RHS = getOperand(1);
1499     // Certain instructions can be cheaper to vectorize if they have a constant
1500     // second vector operand. One example of this is shifts on x86.
1501     TargetTransformInfo::OperandValueInfo RHSInfo = {
1502         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1503     if (RHS->isLiveIn())
1504       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1505 
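    // Note: operands defined outside the loop region are uniform across the
    // vector loop and may be cheaper on some targets than an arbitrary vector
    // operand.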
1506     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1507         getOperand(1)->isDefinedOutsideLoopRegions())
1508       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1509     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1510     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1511 
1512     SmallVector<const Value *, 4> Operands;
1513     if (CtxI)
1514       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1515     return Ctx.TTI.getArithmeticInstrCost(
1516         Opcode, VectorTy, Ctx.CostKind,
1517         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1518         RHSInfo, Operands, CtxI, &Ctx.TLI);
1519   }
1520   case Instruction::Freeze: {
1521     // The cost of this opcode is unknown. Assume that it is the same as 'mul'.
1522     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1523     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
1524                                           Ctx.CostKind);
1525   }
1526   case Instruction::ICmp:
1527   case Instruction::FCmp: {
1528     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1529     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1530     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1531                                       Ctx.CostKind,
1532                                       {TTI::OK_AnyValue, TTI::OP_None},
1533                                       {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1534   }
1535   default:
1536     llvm_unreachable("Unsupported opcode for instruction");
1537   }
1538 }
1539 
1540 void VPWidenEVLRecipe::execute(VPTransformState &State) {
1541   unsigned Opcode = getOpcode();
1542   // TODO: Support other opcodes
1543   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1544     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
1545 
1546   State.setDebugLocFrom(getDebugLoc());
1547 
1548   assert(State.get(getOperand(0))->getType()->isVectorTy() &&
1549          "VPWidenEVLRecipe should not be used for scalars");
1550 
1551   VPValue *EVL = getEVL();
1552   Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
1553   IRBuilderBase &BuilderIR = State.Builder;
1554   VectorBuilder Builder(BuilderIR);
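  // Note: VP intrinsics require an explicit mask operand; use an all-true
  // splat here, since the EVL operand already limits the active lanes.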
1555   Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
1556 
1557   SmallVector<Value *, 4> Ops;
1558   for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
1559     VPValue *VPOp = getOperand(I);
1560     Ops.push_back(State.get(VPOp));
1561   }
1562 
1563   Builder.setMask(Mask).setEVL(EVLArg);
1564   Value *VPInst =
1565       Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
1566   // Currently VP intrinsics only accept fast-math flags.
1567   // TODO: Enable other flags when support is added.
1568   if (isa<FPMathOperator>(VPInst))
1569     setFlags(cast<Instruction>(VPInst));
1570 
1571   State.set(this, VPInst);
1572   State.addMetadata(VPInst,
1573                     dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1574 }
1575 
1576 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1577 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1578                           VPSlotTracker &SlotTracker) const {
1579   O << Indent << "WIDEN ";
1580   printAsOperand(O, SlotTracker);
1581   O << " = " << Instruction::getOpcodeName(Opcode);
1582   printFlags(O);
1583   printOperands(O, SlotTracker);
1584 }
1585 
1586 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1587                              VPSlotTracker &SlotTracker) const {
1588   O << Indent << "WIDEN ";
1589   printAsOperand(O, SlotTracker);
1590   O << " = vp." << Instruction::getOpcodeName(getOpcode());
1591   printFlags(O);
1592   printOperands(O, SlotTracker);
1593 }
1594 #endif
1595 
1596 void VPWidenCastRecipe::execute(VPTransformState &State) {
1597   State.setDebugLocFrom(getDebugLoc());
1598   auto &Builder = State.Builder;
1599   // Vectorize casts.
1600   assert(State.VF.isVector() && "Not vectorizing?");
1601   Type *DestTy = VectorType::get(getResultType(), State.VF);
1602   VPValue *Op = getOperand(0);
1603   Value *A = State.get(Op);
1604   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1605   State.set(this, Cast);
1606   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1607   if (auto *CastOp = dyn_cast<Instruction>(Cast))
1608     setFlags(CastOp);
1609 }
1610 
1611 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
1612                                                VPCostContext &Ctx) const {
1613   // TODO: In some cases, VPWidenCastRecipes are created but not considered in
1614   // the legacy cost model, including truncates/extends when evaluating a
1615   // reduction in a smaller type.
1616   if (!getUnderlyingValue())
1617     return 0;
1618   // Computes the CastContextHint from a recipe that may access memory.
1619   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1620     if (VF.isScalar())
1621       return TTI::CastContextHint::Normal;
1622     if (isa<VPInterleaveRecipe>(R))
1623       return TTI::CastContextHint::Interleave;
1624     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
1625       return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1626                                              : TTI::CastContextHint::Normal;
1627     const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1628     if (WidenMemoryRecipe == nullptr)
1629       return TTI::CastContextHint::None;
1630     if (!WidenMemoryRecipe->isConsecutive())
1631       return TTI::CastContextHint::GatherScatter;
1632     if (WidenMemoryRecipe->isReverse())
1633       return TTI::CastContextHint::Reversed;
1634     if (WidenMemoryRecipe->isMasked())
1635       return TTI::CastContextHint::Masked;
1636     return TTI::CastContextHint::Normal;
1637   };
1638 
1639   VPValue *Operand = getOperand(0);
1640   TTI::CastContextHint CCH = TTI::CastContextHint::None;
1641   // For Trunc/FPTrunc, get the context from the only user.
1642   if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
1643       !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
1644     if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
1645       CCH = ComputeCCH(StoreRecipe);
1646   }
1647   // For ZExt/SExt/FPExt, get the context from the operand.
1648   else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1649            Opcode == Instruction::FPExt) {
1650     if (Operand->isLiveIn())
1651       CCH = TTI::CastContextHint::Normal;
1652     else if (Operand->getDefiningRecipe())
1653       CCH = ComputeCCH(Operand->getDefiningRecipe());
1654   }
1655 
1656   auto *SrcTy =
1657       cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
1658   auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
1659   // Arm TTI will use the underlying instruction to determine the cost.
1660   return Ctx.TTI.getCastInstrCost(
1661       Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
1662       dyn_cast_if_present<Instruction>(getUnderlyingValue()));
1663 }
1664 
1665 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1666 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1667                               VPSlotTracker &SlotTracker) const {
1668   O << Indent << "WIDEN-CAST ";
1669   printAsOperand(O, SlotTracker);
1670   O << " = " << Instruction::getOpcodeName(Opcode);
1671   printFlags(O);
1672   printOperands(O, SlotTracker);
1673   O << " to " << *getResultType();
1674 }
1675 #endif
1676 
1677 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
1678                                                VPCostContext &Ctx) const {
1679   return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
1680 }
1681 
1682 /// This function adds
1683 /// (0 * Step, 1 * Step, 2 * Step, ...)
1684 /// to each vector element of Val.
1685 /// \p BinOp is only relevant for FP induction variables.
1686 static Value *getStepVector(Value *Val, Value *Step,
1687                             Instruction::BinaryOps BinOp, ElementCount VF,
1688                             IRBuilderBase &Builder) {
1689   assert(VF.isVector() && "only vector VFs are supported");
1690 
1691   // Create and check the types.
1692   auto *ValVTy = cast<VectorType>(Val->getType());
1693   ElementCount VLen = ValVTy->getElementCount();
1694 
1695   Type *STy = Val->getType()->getScalarType();
1696   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1697          "Induction Step must be an integer or FP");
1698   assert(Step->getType() == STy && "Step has wrong type");
1699 
1700   SmallVector<Constant *, 8> Indices;
1701 
1702   // Create a vector of consecutive numbers from zero to VF - 1.
1703   VectorType *InitVecValVTy = ValVTy;
1704   if (STy->isFloatingPointTy()) {
1705     Type *InitVecValSTy =
1706         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1707     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1708   }
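  // Note: InitVec is the step vector <0, 1, ..., VLen - 1>, built as an
  // integer vector and converted to FP below when needed.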
1709   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1710 
1711   if (STy->isIntegerTy()) {
1712     Step = Builder.CreateVectorSplat(VLen, Step);
1713     assert(Step->getType() == Val->getType() && "Invalid step vec");
1714     // FIXME: The newly created binary instructions should contain nsw/nuw
1715     // flags, which can be found from the original scalar operations.
1716     Step = Builder.CreateMul(InitVec, Step);
1717     return Builder.CreateAdd(Val, Step, "induction");
1718   }
1719 
1720   // Floating point induction.
1721   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1722          "Binary Opcode should be specified for FP induction");
1723   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1724 
1725   Step = Builder.CreateVectorSplat(VLen, Step);
1726   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1727   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1728 }
1729 
1730 /// A helper function that returns an integer or floating-point constant with
1731 /// value C.
1732 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1733   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1734                            : ConstantFP::get(Ty, C);
1735 }
1736 
1737 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1738   assert(!State.Lane && "Int or FP induction being replicated.");
1739 
1740   Value *Start = getStartValue()->getLiveInIRValue();
1741   const InductionDescriptor &ID = getInductionDescriptor();
1742   TruncInst *Trunc = getTruncInst();
1743   IRBuilderBase &Builder = State.Builder;
1744   assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
1745          "Types must match");
1746   assert(State.VF.isVector() && "must have vector VF");
1747 
1748   // The value from the original loop to which we are mapping the new induction
1749   // variable.
1750   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
1751 
1752   // Fast-math-flags propagate from the original induction instruction.
1753   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1754   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1755     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1756 
1757   // Now do the actual transformations, and start with fetching the step value.
1758   Value *Step = State.get(getStepValue(), VPLane(0));
1759 
1760   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1761          "Expected either an induction phi-node or a truncate of it!");
1762 
1763   // Construct the initial value of the vector IV in the vector loop preheader
1764   auto CurrIP = Builder.saveIP();
1765   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1766   Builder.SetInsertPoint(VectorPH->getTerminator());
1767   if (isa<TruncInst>(EntryVal)) {
1768     assert(Start->getType()->isIntegerTy() &&
1769            "Truncation requires an integer type");
1770     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1771     Step = Builder.CreateTrunc(Step, TruncType);
1772     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1773   }
1774 
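  // Note: SteppedStart holds the per-lane induction values of the first vector
  // iteration, i.e. splat(Start) combined with <0, 1, ..., VF - 1> * Step via
  // the induction opcode.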
1775   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1776   Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
1777                                       State.VF, State.Builder);
1778 
1779   // We create vector phi nodes for both integer and floating-point induction
1780   // variables. Here, we determine the kind of arithmetic we will perform.
1781   Instruction::BinaryOps AddOp;
1782   Instruction::BinaryOps MulOp;
1783   if (Step->getType()->isIntegerTy()) {
1784     AddOp = Instruction::Add;
1785     MulOp = Instruction::Mul;
1786   } else {
1787     AddOp = ID.getInductionOpcode();
1788     MulOp = Instruction::FMul;
1789   }
1790 
1791   Value *SplatVF;
1792   if (VPValue *SplatVFOperand = getSplatVFValue()) {
1793     // The recipe has been unrolled. In that case, fetch the splat value for the
1794     // induction increment.
1795     SplatVF = State.get(SplatVFOperand);
1796   } else {
1797     // Multiply the vectorization factor by the step using integer or
1798     // floating-point arithmetic as appropriate.
1799     Type *StepType = Step->getType();
1800     Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
1801     if (Step->getType()->isFloatingPointTy())
1802       RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1803     else
1804       RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1805     Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1806 
1807     // Create a vector splat to use in the induction update.
1808     SplatVF = Builder.CreateVectorSplat(State.VF, Mul);
1809   }
1810 
1811   Builder.restoreIP(CurrIP);
1812 
1813   // We may need to add the step a number of times, depending on the unroll
1814   // factor. The last of those goes into the PHI.
1815   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1816   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1817   VecInd->setDebugLoc(getDebugLoc());
1818   State.set(this, VecInd);
1819 
1820   Instruction *LastInduction = cast<Instruction>(
1821       Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
1822   if (isa<TruncInst>(EntryVal))
1823     State.addMetadata(LastInduction, EntryVal);
1824   LastInduction->setDebugLoc(getDebugLoc());
1825 
1826   VecInd->addIncoming(SteppedStart, VectorPH);
1827   // Add induction update using an incorrect block temporarily. The phi node
1828   // will be fixed after VPlan execution. Note that at this point the latch
1829   // block cannot be used, as it does not exist yet.
1830   // TODO: Model increment value in VPlan, by turning the recipe into a
1831   // multi-def and a subclass of VPHeaderPHIRecipe.
1832   VecInd->addIncoming(LastInduction, VectorPH);
1833 }
1834 
1835 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1836 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1837                                           VPSlotTracker &SlotTracker) const {
1838   O << Indent;
1839   printAsOperand(O, SlotTracker);
1840   O << " = WIDEN-INDUCTION  ";
1841   printOperands(O, SlotTracker);
1842 
1843   if (auto *TI = getTruncInst())
1844     O << " (truncated to " << *TI->getType() << ")";
1845 }
1846 #endif
1847 
1848 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1849   // The step may be defined by a recipe in the preheader (e.g. if it requires
1850   // SCEV expansion), but for the canonical induction the step is required to be
1851   // 1, which is represented as live-in.
1852   if (getStepValue()->getDefiningRecipe())
1853     return false;
1854   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1855   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1856   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1857   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1858          getScalarType() == CanIV->getScalarType();
1859 }
1860 
1861 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1862 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1863                               VPSlotTracker &SlotTracker) const {
1864   O << Indent;
1865   printAsOperand(O, SlotTracker);
1866   O << " = DERIVED-IV ";
1867   getStartValue()->printAsOperand(O, SlotTracker);
1868   O << " + ";
1869   getOperand(1)->printAsOperand(O, SlotTracker);
1870   O << " * ";
1871   getStepValue()->printAsOperand(O, SlotTracker);
1872 }
1873 #endif
1874 
1875 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1876   // Fast-math-flags propagate from the original induction instruction.
1877   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1878   if (hasFastMathFlags())
1879     State.Builder.setFastMathFlags(getFastMathFlags());
1880 
1881   // Compute scalar induction steps. BaseIV is the scalar induction variable on
1882   // which to base the steps; Step is the size of the step.
1883 
1884   Value *BaseIV = State.get(getOperand(0), VPLane(0));
1885   Value *Step = State.get(getStepValue(), VPLane(0));
1886   IRBuilderBase &Builder = State.Builder;
1887 
1888   // Ensure step has the same type as that of scalar IV.
1889   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1890   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1891 
1892   // We build scalar steps for both integer and floating-point induction
1893   // variables. Here, we determine the kind of arithmetic we will perform.
1894   Instruction::BinaryOps AddOp;
1895   Instruction::BinaryOps MulOp;
1896   if (BaseIVTy->isIntegerTy()) {
1897     AddOp = Instruction::Add;
1898     MulOp = Instruction::Mul;
1899   } else {
1900     AddOp = InductionOpcode;
1901     MulOp = Instruction::FMul;
1902   }
1903 
1904   // Determine the number of scalars we need to generate for each unroll
1905   // iteration.
1906   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1907   // Compute the scalar steps and save the results in State.
1908   Type *IntStepTy =
1909       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1910   Type *VecIVTy = nullptr;
1911   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1912   if (!FirstLaneOnly && State.VF.isScalable()) {
1913     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1914     UnitStepVec =
1915         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1916     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1917     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1918   }
1919 
1920   unsigned StartLane = 0;
1921   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1922   if (State.Lane) {
1923     StartLane = State.Lane->getKnownLane();
1924     EndLane = StartLane + 1;
1925   }
1926   Value *StartIdx0 =
1927       createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
1928 
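  // Note: for scalable VFs, materialize the whole step vector at once as
  // BaseIV + (StartIdx0 + <0, 1, ...>) * Step.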
1929   if (!FirstLaneOnly && State.VF.isScalable()) {
1930     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1931     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1932     if (BaseIVTy->isFloatingPointTy())
1933       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1934     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1935     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1936     State.set(this, Add);
1937     // It's useful to record the lane values too for the known minimum number
1938     // of elements so we do those below. This improves the code quality when
1939     // trying to extract the first element, for example.
1940   }
1941 
1942   if (BaseIVTy->isFloatingPointTy())
1943     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1944 
1945   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1946     Value *StartIdx = Builder.CreateBinOp(
1947         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1948     // The step returned by `createStepForVF` is a runtime-evaluated value
1949     // when VF is scalable. Otherwise, it should be folded into a Constant.
1950     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1951            "Expected StartIdx to be folded to a constant when VF is not "
1952            "scalable");
1953     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1954     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1955     State.set(this, Add, VPLane(Lane));
1956   }
1957 }
1958 
1959 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1960 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1961                                   VPSlotTracker &SlotTracker) const {
1962   O << Indent;
1963   printAsOperand(O, SlotTracker);
1964   O << " = SCALAR-STEPS ";
1965   printOperands(O, SlotTracker);
1966 }
1967 #endif
1968 
1969 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1970   assert(State.VF.isVector() && "not widening");
1971   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1972   // Construct a vector GEP by widening the operands of the scalar GEP as
1973   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1974   // results in a vector of pointers when at least one operand of the GEP
1975   // is vector-typed. Thus, to keep the representation compact, we only use
1976   // vector-typed operands for loop-varying values.
1977 
1978   if (areAllOperandsInvariant()) {
1979     // If we are vectorizing, but the GEP has only loop-invariant operands,
1980     // the GEP we build (by only using vector-typed operands for
1981     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1982     // produce a vector of pointers, we need to either arbitrarily pick an
1983     // operand to broadcast, or broadcast a clone of the original GEP.
1984     // Here, we broadcast a clone of the original.
1985     //
1986     // TODO: If at some point we decide to scalarize instructions having
1987     //       loop-invariant operands, this special case will no longer be
1988     //       required. We would add the scalarization decision to
1989     //       collectLoopScalars() and teach getVectorValue() to broadcast
1990     //       the lane-zero scalar value.
1991     SmallVector<Value *> Ops;
1992     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1993       Ops.push_back(State.get(getOperand(I), VPLane(0)));
1994 
1995     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1996                                            ArrayRef(Ops).drop_front(), "",
1997                                            getGEPNoWrapFlags());
1998     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1999     State.set(this, Splat);
2000     State.addMetadata(Splat, GEP);
2001   } else {
2002     // If the GEP has at least one loop-varying operand, we are sure to
2003     // produce a vector of pointers unless VF is scalar.
2004     // The pointer operand of the new GEP. If it's loop-invariant, we
2005     // won't broadcast it.
2006     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
2007                                          : State.get(getOperand(0));
2008 
2009     // Collect all the indices for the new GEP. If any index is
2010     // loop-invariant, we won't broadcast it.
2011     SmallVector<Value *, 4> Indices;
2012     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2013       VPValue *Operand = getOperand(I);
2014       if (isIndexLoopInvariant(I - 1))
2015         Indices.push_back(State.get(Operand, VPLane(0)));
2016       else
2017         Indices.push_back(State.get(Operand));
2018     }
2019 
2020     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2021     // but it should be a vector otherwise.
2022     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
2023                                            Indices, "", getGEPNoWrapFlags());
2024     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2025            "NewGEP is not a pointer vector");
2026     State.set(this, NewGEP);
2027     State.addMetadata(NewGEP, GEP);
2028   }
2029 }
2030 
2031 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2032 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
2033                              VPSlotTracker &SlotTracker) const {
2034   O << Indent << "WIDEN-GEP ";
2035   O << (isPointerLoopInvariant() ? "Inv" : "Var");
2036   for (size_t I = 0; I < getNumOperands() - 1; ++I)
2037     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2038 
2039   O << " ";
2040   printAsOperand(O, SlotTracker);
2041   O << " = getelementptr";
2042   printFlags(O);
2043   printOperands(O, SlotTracker);
2044 }
2045 #endif
2046 
2047 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
2048                            unsigned CurrentPart, IRBuilderBase &Builder) {
2049   // Use i32 for the gep index type when the value is constant,
2050   // or query DataLayout for a more suitable index type otherwise.
2051   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2052   return IsScalable && (IsReverse || CurrentPart > 0)
2053              ? DL.getIndexType(Builder.getPtrTy(0))
2054              : Builder.getInt32Ty();
2055 }
2056 
2057 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) {
2058   auto &Builder = State.Builder;
2059   State.setDebugLocFrom(getDebugLoc());
2060   unsigned CurrentPart = getUnrollPart(*this);
2061   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2062                                 CurrentPart, Builder);
2063 
2064   // The wide store needs to start at the last vector element.
2065   Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2066   if (IndexTy != RunTimeVF->getType())
2067     RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2068   // NumElt = -CurrentPart * RunTimeVF
2069   Value *NumElt = Builder.CreateMul(
2070       ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
2071   // LastLane = 1 - RunTimeVF
2072   Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
2073   Value *Ptr = State.get(getOperand(0), VPLane(0));
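  // Note: in total, ResultPtr = Ptr - CurrentPart * RunTimeVF - (RunTimeVF - 1),
  // i.e. the lowest address accessed by this part of the reversed wide access.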
2074   Value *ResultPtr =
2075       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2076   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2077                                 getGEPNoWrapFlags());
2078 
2079   State.set(this, ResultPtr, /*IsScalar*/ true);
2080 }
2081 
2082 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2083 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2084                                          VPSlotTracker &SlotTracker) const {
2085   O << Indent;
2086   printAsOperand(O, SlotTracker);
2087   O << " = reverse-vector-pointer";
2088   printFlags(O);
2089   printOperands(O, SlotTracker);
2090 }
2091 #endif
2092 
2093 void VPVectorPointerRecipe::execute(VPTransformState &State) {
2094   auto &Builder = State.Builder;
2095   State.setDebugLocFrom(getDebugLoc());
2096   unsigned CurrentPart = getUnrollPart(*this);
2097   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2098                                 CurrentPart, Builder);
2099   Value *Ptr = State.get(getOperand(0), VPLane(0));
2100 
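  // Note: the increment is CurrentPart * VF elements, so each unrolled part
  // advances the pointer by one whole vector.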
2101   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2102   Value *ResultPtr =
2103       Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
2104 
2105   State.set(this, ResultPtr, /*IsScalar*/ true);
2106 }
2107 
2108 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2109 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2110                                   VPSlotTracker &SlotTracker) const {
2111   O << Indent;
2112   printAsOperand(O, SlotTracker);
2113   O << " = vector-pointer ";
2114 
2115   printOperands(O, SlotTracker);
2116 }
2117 #endif
2118 
2119 void VPBlendRecipe::execute(VPTransformState &State) {
2120   assert(isNormalized() && "Expected blend to be normalized!");
2121   State.setDebugLocFrom(getDebugLoc());
2122   // We know that all PHIs in non-header blocks are converted into
2123   // selects, so we don't have to worry about the insertion order and we
2124   // can just use the builder.
2125   // At this point we generate the predication tree. There may be
2126   // duplications since this is a simple recursive scan, but future
2127   // optimizations will clean it up.
2128 
2129   unsigned NumIncoming = getNumIncomingValues();
2130 
2131   // Generate a sequence of selects of the form:
2132   // SELECT(Mask3, In3,
2133   //        SELECT(Mask2, In2,
2134   //               SELECT(Mask1, In1,
2135   //                      In0)))
2136   // Note that Mask0 is never used: lanes for which no path reaches this phi are
2137   // essentially undef and are taken from In0.
2138   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
2139   Value *Result = nullptr;
2140   for (unsigned In = 0; In < NumIncoming; ++In) {
2141     // We might have single edge PHIs (blocks) - use an identity
2142     // 'select' for the first PHI operand.
2143     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
2144     if (In == 0)
2145       Result = In0; // Initialize with the first incoming value.
2146     else {
2147       // Select between the current value and the previous incoming edge
2148       // based on the incoming mask.
2149       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
2150       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
2151     }
2152   }
2153   State.set(this, Result, OnlyFirstLaneUsed);
2154 }
2155 
2156 InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2157                                            VPCostContext &Ctx) const {
2158   // Handle cases where only the first lane is used the same way as the legacy
2159   // cost model.
2160   if (vputils::onlyFirstLaneUsed(this))
2161     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2162 
2163   Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2164   Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2165   return (getNumIncomingValues() - 1) *
2166          Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2167                                     CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2168 }
2169 
2170 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2171 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
2172                           VPSlotTracker &SlotTracker) const {
2173   O << Indent << "BLEND ";
2174   printAsOperand(O, SlotTracker);
2175   O << " =";
2176   if (getNumIncomingValues() == 1) {
2177     // Not a User of any mask: not really blending, this is a
2178     // single-predecessor phi.
2179     O << " ";
2180     getIncomingValue(0)->printAsOperand(O, SlotTracker);
2181   } else {
2182     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2183       O << " ";
2184       getIncomingValue(I)->printAsOperand(O, SlotTracker);
2185       if (I == 0)
2186         continue;
2187       O << "/";
2188       getMask(I)->printAsOperand(O, SlotTracker);
2189     }
2190   }
2191 }
2192 #endif
2193 
2194 void VPReductionRecipe::execute(VPTransformState &State) {
2195   assert(!State.Lane && "Reduction being replicated.");
2196   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2197   RecurKind Kind = RdxDesc.getRecurrenceKind();
2198   // Propagate the fast-math flags carried by the underlying instruction.
2199   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2200   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2201   State.setDebugLocFrom(getDebugLoc());
2202   Value *NewVecOp = State.get(getVecOp());
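  // Note: for conditional reductions, blend the reduction identity (or the
  // start value for any-of reductions) into the masked-off lanes before
  // reducing.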
2203   if (VPValue *Cond = getCondOp()) {
2204     Value *NewCond = State.get(Cond, State.VF.isScalar());
2205     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2206     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2207 
2208     Value *Start;
2209     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
2210       Start = RdxDesc.getRecurrenceStartValue();
2211     else
2212       Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
2213                                           RdxDesc.getFastMathFlags());
2214     if (State.VF.isVector())
2215       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2216 
2217     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2218     NewVecOp = Select;
2219   }
2220   Value *NewRed;
2221   Value *NextInChain;
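  // Note: ordered reductions preserve the FP evaluation order by folding the
  // vector directly into the chain value; unordered reductions reduce the
  // vector first and then combine the result with the chain via min/max or the
  // reduction's binary op.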
2222   if (IsOrdered) {
2223     if (State.VF.isVector())
2224       NewRed =
2225           createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
2226     else
2227       NewRed = State.Builder.CreateBinOp(
2228           (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp);
2229     PrevInChain = NewRed;
2230     NextInChain = NewRed;
2231   } else {
2232     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2233     NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
2234     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2235       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
2236                                    NewRed, PrevInChain);
2237     else
2238       NextInChain = State.Builder.CreateBinOp(
2239           (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain);
2240   }
2241   State.set(this, NextInChain, /*IsScalar*/ true);
2242 }
2243 
2244 void VPReductionEVLRecipe::execute(VPTransformState &State) {
2245   assert(!State.Lane && "Reduction being replicated.");
2246 
2247   auto &Builder = State.Builder;
2248   // Propagate the fast-math flags carried by the underlying instruction.
2249   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2250   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2251   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2252 
2253   RecurKind Kind = RdxDesc.getRecurrenceKind();
2254   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2255   Value *VecOp = State.get(getVecOp());
2256   Value *EVL = State.get(getEVL(), VPLane(0));
2257 
2258   VectorBuilder VBuilder(Builder);
2259   VBuilder.setEVL(EVL);
2260   Value *Mask;
2261   // TODO: move the all-true mask generation into VectorBuilder.
2262   if (VPValue *CondOp = getCondOp())
2263     Mask = State.get(CondOp);
2264   else
2265     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2266   VBuilder.setMask(Mask);
2267 
2268   Value *NewRed;
2269   if (isOrdered()) {
2270     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
2271   } else {
2272     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
2273     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2274       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2275     else
2276       NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(),
2277                                    NewRed, Prev);
2278   }
2279   State.set(this, NewRed, /*IsScalar*/ true);
2280 }
2281 
2282 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2283                                                VPCostContext &Ctx) const {
2284   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
2285   Type *ElementTy = Ctx.Types.inferScalarType(this);
2286   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2287   unsigned Opcode = RdxDesc.getOpcode();
2288 
2289   // TODO: Support any-of and in-loop reductions.
2290   assert(
2291       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2292        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2293       "Any-of reduction not implemented in VPlan-based cost model currently.");
2294   assert(
2295       (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
2296        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2297       "In-loop reduction not implemented in VPlan-based cost model currently.");
2298 
2299   assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() &&
2300          "Inferred type and recurrence type mismatch.");
2301 
2302   // Cost = Reduction cost + BinOp cost
2303   InstructionCost Cost =
2304       Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
2305   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
2306     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
2307     return Cost + Ctx.TTI.getMinMaxReductionCost(
2308                       Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
2309   }
2310 
2311   return Cost + Ctx.TTI.getArithmeticReductionCost(
2312                     Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
2313 }
2314 
2315 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2316 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2317                               VPSlotTracker &SlotTracker) const {
2318   O << Indent << "REDUCE ";
2319   printAsOperand(O, SlotTracker);
2320   O << " = ";
2321   getChainOp()->printAsOperand(O, SlotTracker);
2322   O << " +";
2323   if (isa<FPMathOperator>(getUnderlyingInstr()))
2324     O << getUnderlyingInstr()->getFastMathFlags();
2325   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2326   getVecOp()->printAsOperand(O, SlotTracker);
2327   if (isConditional()) {
2328     O << ", ";
2329     getCondOp()->printAsOperand(O, SlotTracker);
2330   }
2331   O << ")";
2332   if (RdxDesc.IntermediateStore)
2333     O << " (with final reduction value stored in invariant address sank "
2334          "outside of loop)";
2335 }
2336 
2337 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2338                                  VPSlotTracker &SlotTracker) const {
2339   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2340   O << Indent << "REDUCE ";
2341   printAsOperand(O, SlotTracker);
2342   O << " = ";
2343   getChainOp()->printAsOperand(O, SlotTracker);
2344   O << " +";
2345   if (isa<FPMathOperator>(getUnderlyingInstr()))
2346     O << getUnderlyingInstr()->getFastMathFlags();
2347   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2348   getVecOp()->printAsOperand(O, SlotTracker);
2349   O << ", ";
2350   getEVL()->printAsOperand(O, SlotTracker);
2351   if (isConditional()) {
2352     O << ", ";
2353     getCondOp()->printAsOperand(O, SlotTracker);
2354   }
2355   O << ")";
2356   if (RdxDesc.IntermediateStore)
2357     O << " (with final reduction value stored in invariant address sank "
2358          "outside of loop)";
2359 }
2360 #endif
2361 
2362 bool VPReplicateRecipe::shouldPack() const {
2363   // Find if the recipe is used by a widened recipe via an intervening
2364   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2365   return any_of(users(), [](const VPUser *U) {
2366     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2367       return any_of(PredR->users(), [PredR](const VPUser *U) {
2368         return !U->usesScalars(PredR);
2369       });
2370     return false;
2371   });
2372 }
2373 
2374 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
2375                                                VPCostContext &Ctx) const {
2376   Instruction *UI = cast<Instruction>(getUnderlyingValue());
2377   // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
2378   // transform; avoid computing their cost multiple times for now.
2379   Ctx.SkipCostComputation.insert(UI);
2380   return Ctx.getLegacyCost(UI, VF);
2381 }
2382 
2383 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2384 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2385                               VPSlotTracker &SlotTracker) const {
2386   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2387 
2388   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2389     printAsOperand(O, SlotTracker);
2390     O << " = ";
2391   }
2392   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2393     O << "call";
2394     printFlags(O);
2395     O << "@" << CB->getCalledFunction()->getName() << "(";
2396     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2397                     O, [&O, &SlotTracker](VPValue *Op) {
2398                       Op->printAsOperand(O, SlotTracker);
2399                     });
2400     O << ")";
2401   } else {
2402     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2403     printFlags(O);
2404     printOperands(O, SlotTracker);
2405   }
2406 
2407   if (shouldPack())
2408     O << " (S->V)";
2409 }
2410 #endif
2411 
2412 Value *VPScalarCastRecipe::generate(VPTransformState &State) {
2413   State.setDebugLocFrom(getDebugLoc());
2414   assert(vputils::onlyFirstLaneUsed(this) &&
2415          "Codegen only implemented for first lane.");
2416   switch (Opcode) {
2417   case Instruction::SExt:
2418   case Instruction::ZExt:
2419   case Instruction::Trunc: {
2420     // Note: SExt/ZExt not used yet.
2421     Value *Op = State.get(getOperand(0), VPLane(0));
2422     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2423   }
2424   default:
2425     llvm_unreachable("opcode not implemented yet");
2426   }
2427 }
2428 
2429 void VPScalarCastRecipe::execute(VPTransformState &State) {
2430   State.set(this, generate(State), VPLane(0));
2431 }
2432 
2433 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2434 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2435                                 VPSlotTracker &SlotTracker) const {
2436   O << Indent << "SCALAR-CAST ";
2437   printAsOperand(O, SlotTracker);
2438   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2439   printOperands(O, SlotTracker);
2440   O << " to " << *ResultTy;
2441 }
2442 #endif
2443 
2444 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2445   assert(State.Lane && "Branch on Mask works only on single instance.");
2446 
2448   Value *ConditionBit = nullptr;
2449   VPValue *BlockInMask = getMask();
2450   if (BlockInMask)
2451     ConditionBit = State.get(BlockInMask, *State.Lane);
2452   else // Block in mask is all-one.
2453     ConditionBit = State.Builder.getTrue();
2454 
2455   // Replace the temporary unreachable terminator with a new conditional branch,
2456   // whose two destinations will be set later when they are created.
2457   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2458   assert(isa<UnreachableInst>(CurrentTerminator) &&
2459          "Expected to replace unreachable terminator with conditional branch.");
2460   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2461   CondBr->setSuccessor(0, nullptr);
2462   ReplaceInstWithInst(CurrentTerminator, CondBr);
2463 }
2464 
2465 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
2466                                                   VPCostContext &Ctx) const {
2467   // The legacy cost model doesn't assign costs to branches for individual
2468   // replicate regions. Match the current behavior in the VPlan cost model for
2469   // now.
2470   return 0;
2471 }
2472 
2473 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2474   State.setDebugLocFrom(getDebugLoc());
2475   assert(State.Lane && "Predicated instruction PHI works per instance.");
2476   Instruction *ScalarPredInst =
2477       cast<Instruction>(State.get(getOperand(0), *State.Lane));
2478   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2479   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2480   assert(PredicatingBB && "Predicated block has no single predecessor.");
2481   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2482          "operand must be VPReplicateRecipe");
2483 
2484   // By current pack/unpack logic we need to generate only a single phi node: if
2485   // a vector value for the predicated instruction exists at this point it means
2486   // the instruction has vector users only, and a phi for the vector value is
2487   // needed. In this case the recipe of the predicated instruction is marked to
2488   // also do that packing, thereby "hoisting" the insert-element sequence.
2489   // Otherwise, a phi node for the scalar value is needed.
2490   if (State.hasVectorValue(getOperand(0))) {
2491     Value *VectorValue = State.get(getOperand(0));
2492     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2493     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2494     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2495     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2496     if (State.hasVectorValue(this))
2497       State.reset(this, VPhi);
2498     else
2499       State.set(this, VPhi);
2500     // NOTE: Currently we need to update the value of the operand, so the next
2501     // predicated iteration inserts its generated value in the correct vector.
2502     State.reset(getOperand(0), VPhi);
2503   } else {
2504     if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
2505       return;
2506 
2507     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2508     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2509     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2510                      PredicatingBB);
2511     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2512     if (State.hasScalarValue(this, *State.Lane))
2513       State.reset(this, Phi, *State.Lane);
2514     else
2515       State.set(this, Phi, *State.Lane);
2516     // NOTE: Currently we need to update the value of the operand, so the next
2517     // predicated iteration inserts its generated value in the correct vector.
2518     State.reset(getOperand(0), Phi, *State.Lane);
2519   }
2520 }
2521 
2522 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2523 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2524                                 VPSlotTracker &SlotTracker) const {
2525   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2526   printAsOperand(O, SlotTracker);
2527   O << " = ";
2528   printOperands(O, SlotTracker);
2529 }
2530 #endif
2531 
2532 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2533                                                  VPCostContext &Ctx) const {
2534   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2535   const Align Alignment =
2536       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2537   unsigned AS =
2538       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2539 
2540   if (!Consecutive) {
2541     // TODO: Using the original IR may not be accurate.
2542     // Currently, ARM will use the underlying IR to calculate gather/scatter
2543     // instruction cost.
2544     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2545     assert(!Reverse &&
2546            "Inconsecutive memory access should not have the order.");
2547     return Ctx.TTI.getAddressComputationCost(Ty) +
2548            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2549                                           IsMasked, Alignment, Ctx.CostKind,
2550                                           &Ingredient);
2551   }
2552 
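  // Note: for consecutive accesses, cost a (masked) wide load/store, plus a
  // reverse shuffle if the access is reversed.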
2553   InstructionCost Cost = 0;
2554   if (IsMasked) {
2555     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2556                                           AS, Ctx.CostKind);
2557   } else {
2558     TTI::OperandValueInfo OpInfo =
2559         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2560     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2561                                     Ctx.CostKind, OpInfo, &Ingredient);
2562   }
2563   if (!Reverse)
2564     return Cost;
2565 
2566   return Cost +
2567          Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2568                                 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
2569 }
2570 
2571 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2572   auto *LI = cast<LoadInst>(&Ingredient);
2573 
2574   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2575   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2576   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2577   bool CreateGather = !isConsecutive();
2578 
2579   auto &Builder = State.Builder;
2580   State.setDebugLocFrom(getDebugLoc());
2581   Value *Mask = nullptr;
2582   if (auto *VPMask = getMask()) {
2583     // Mask reversal is only needed for non-all-one (non-null) masks, as the
2584     // reverse of a null all-one mask is still a null mask.
2585     Mask = State.get(VPMask);
2586     if (isReverse())
2587       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2588   }
2589 
2590   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
2591   Value *NewLI;
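  // Note: emit a gather for non-consecutive accesses, a masked load if a mask
  // is present, and a plain wide load otherwise.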
2592   if (CreateGather) {
2593     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2594                                        "wide.masked.gather");
2595   } else if (Mask) {
2596     NewLI =
2597         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2598                                  PoisonValue::get(DataTy), "wide.masked.load");
2599   } else {
2600     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2601   }
2602   // Add metadata to the load, but set the vector value to the reverse shuffle.
2603   State.addMetadata(NewLI, LI);
2604   if (Reverse)
2605     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2606   State.set(this, NewLI);
2607 }
2608 
2609 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2610 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2611                               VPSlotTracker &SlotTracker) const {
2612   O << Indent << "WIDEN ";
2613   printAsOperand(O, SlotTracker);
2614   O << " = load ";
2615   printOperands(O, SlotTracker);
2616 }
2617 #endif
2618 
2619 /// Use an all-true mask for the reverse rather than the actual mask, as this
2620 /// avoids a dependence without affecting the result.
2621 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2622                                      Value *EVL, const Twine &Name) {
2623   VectorType *ValTy = cast<VectorType>(Operand->getType());
2624   Value *AllTrueMask =
2625       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2626   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2627                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2628 }
2629 
2630 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2631   auto *LI = cast<LoadInst>(&Ingredient);
2632 
2633   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2634   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2635   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2636   bool CreateGather = !isConsecutive();
2637 
2638   auto &Builder = State.Builder;
2639   State.setDebugLocFrom(getDebugLoc());
2640   CallInst *NewLI;
2641   Value *EVL = State.get(getEVL(), VPLane(0));
2642   Value *Addr = State.get(getAddr(), !CreateGather);
2643   Value *Mask = nullptr;
2644   if (VPValue *VPMask = getMask()) {
2645     Mask = State.get(VPMask);
2646     if (isReverse())
2647       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2648   } else {
2649     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2650   }
2651 
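  // Note: emit a vp.gather for non-consecutive accesses; otherwise build a
  // vp.load through VectorBuilder so the EVL and mask operands are attached.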
2652   if (CreateGather) {
2653     NewLI =
2654         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2655                                 nullptr, "wide.masked.gather");
2656   } else {
2657     VectorBuilder VBuilder(Builder);
2658     VBuilder.setEVL(EVL).setMask(Mask);
2659     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2660         Instruction::Load, DataTy, Addr, "vp.op.load"));
2661   }
2662   NewLI->addParamAttr(
2663       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2664   State.addMetadata(NewLI, LI);
2665   Instruction *Res = NewLI;
2666   if (isReverse())
2667     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2668   State.set(this, Res);
2669 }
2670 
2671 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
2672                                                   VPCostContext &Ctx) const {
2673   if (!Consecutive || IsMasked)
2674     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2675 
2676   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2677   // because the EVL recipes use EVL to replace the tail mask, while the legacy
2678   // model always calculates the cost of the mask.
2679   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2680   // longer need to compare against the legacy cost model.
2681   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2682   const Align Alignment =
2683       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2684   unsigned AS =
2685       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2686   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2687       Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
2688   if (!Reverse)
2689     return Cost;
2690 
2691   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2692                                        cast<VectorType>(Ty), {}, Ctx.CostKind,
2693                                        0);
2694 }
2695 
2696 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2697 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2698                                  VPSlotTracker &SlotTracker) const {
2699   O << Indent << "WIDEN ";
2700   printAsOperand(O, SlotTracker);
2701   O << " = vp.load ";
2702   printOperands(O, SlotTracker);
2703 }
2704 #endif
2705 
2706 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2707   auto *SI = cast<StoreInst>(&Ingredient);
2708 
2709   VPValue *StoredVPValue = getStoredValue();
2710   bool CreateScatter = !isConsecutive();
2711   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2712 
2713   auto &Builder = State.Builder;
2714   State.setDebugLocFrom(getDebugLoc());
2715 
2716   Value *Mask = nullptr;
2717   if (auto *VPMask = getMask()) {
2718     // Mask reversal is only needed when an explicit mask is present; an
2719     // all-true mask is represented by a null Mask and reverses to itself.
2720     Mask = State.get(VPMask);
2721     if (isReverse())
2722       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2723   }
2724 
2725   Value *StoredVal = State.get(StoredVPValue);
2726   if (isReverse()) {
2727     // If we store to reverse consecutive memory locations, then we need
2728     // to reverse the order of elements in the stored value.
2729     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2730     // We don't want to update the value in the map as it might be used in
2731     // another expression. So don't call resetVectorValue(StoredVal).
2732   }
2733   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2734   Instruction *NewSI = nullptr;
2735   if (CreateScatter)
2736     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2737   else if (Mask)
2738     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2739   else
2740     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2741   State.addMetadata(NewSI, SI);
2742 }
2743 
2744 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2745 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2746                                VPSlotTracker &SlotTracker) const {
2747   O << Indent << "WIDEN store ";
2748   printOperands(O, SlotTracker);
2749 }
2750 #endif
2751 
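// Sketch of the IR emitted below (illustrative names and types): a
// non-consecutive access becomes a vp.scatter, a consecutive one a vp.store
// via VectorBuilder; a reversed access first passes the stored value (and
// mask) through llvm.experimental.vp.reverse, e.g. roughly:
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr %addr,
//                                       <vscale x 4 x i1> %m, i32 %evl)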
2752 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2753   auto *SI = cast<StoreInst>(&Ingredient);
2754 
2755   VPValue *StoredValue = getStoredValue();
2756   bool CreateScatter = !isConsecutive();
2757   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2758 
2759   auto &Builder = State.Builder;
2760   State.setDebugLocFrom(getDebugLoc());
2761 
2762   CallInst *NewSI = nullptr;
2763   Value *StoredVal = State.get(StoredValue);
2764   Value *EVL = State.get(getEVL(), VPLane(0));
2765   if (isReverse())
2766     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2767   Value *Mask = nullptr;
2768   if (VPValue *VPMask = getMask()) {
2769     Mask = State.get(VPMask);
2770     if (isReverse())
2771       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2772   } else {
2773     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2774   }
2775   Value *Addr = State.get(getAddr(), !CreateScatter);
2776   if (CreateScatter) {
2777     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2778                                     Intrinsic::vp_scatter,
2779                                     {StoredVal, Addr, Mask, EVL});
2780   } else {
2781     VectorBuilder VBuilder(Builder);
2782     VBuilder.setEVL(EVL).setMask(Mask);
2783     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2784         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2785         {StoredVal, Addr}));
2786   }
2787   NewSI->addParamAttr(
2788       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2789   State.addMetadata(NewSI, SI);
2790 }
2791 
2792 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
2793                                                    VPCostContext &Ctx) const {
2794   if (!Consecutive || IsMasked)
2795     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2796 
2797   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2798   // because the EVL recipes use EVL to replace the tail mask, while the legacy
2799   // model always accounts for the cost of the mask.
2800   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2801   // longer need to compare against the legacy cost model.
2802   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2803   const Align Alignment =
2804       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2805   unsigned AS =
2806       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2807   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2808       Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
2809   if (!Reverse)
2810     return Cost;
2811 
2812   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2813                                        cast<VectorType>(Ty), {}, Ctx.CostKind,
2814                                        0);
2815 }
2816 
2817 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2818 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2819                                   VPSlotTracker &SlotTracker) const {
2820   O << Indent << "WIDEN vp.store ";
2821   printOperands(O, SlotTracker);
2822 }
2823 #endif
2824 
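/// Cast \p V to vector type \p DstVTy, assuming matching element counts and
/// equal element sizes. For example (assuming 64-bit pointers), a <2 x double>
/// value is cast to <2 x ptr> via an intermediate <2 x i64> vector.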
2825 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2826                                      VectorType *DstVTy, const DataLayout &DL) {
2827   // Verify that V is a vector with the same number of elements as DstVTy.
2828   auto VF = DstVTy->getElementCount();
2829   auto *SrcVecTy = cast<VectorType>(V->getType());
2830   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2831   Type *SrcElemTy = SrcVecTy->getElementType();
2832   Type *DstElemTy = DstVTy->getElementType();
2833   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2834          "Vector elements must have same size");
2835 
2836   // Do a direct cast if element types are castable.
2837   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2838     return Builder.CreateBitOrPointerCast(V, DstVTy);
2839   }
2840   // V cannot be cast directly to the desired vector type. This may happen
2841   // when V is a floating point vector but DstVTy is a vector of pointers, or
2842   // vice versa. Handle this with a two-step bitcast through an intermediate
2843   // integer type, i.e. Ptr <-> Int <-> Float.
2844   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2845          "Only one type should be a pointer type");
2846   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2847          "Only one type should be a floating point type");
2848   Type *IntTy =
2849       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2850   auto *VecIntTy = VectorType::get(IntTy, VF);
2851   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2852   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2853 }
2854 
2855 /// Return a vector containing interleaved elements from multiple
2856 /// smaller input vectors.
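/// E.g. interleaving two fixed <4 x i32> inputs A and B yields a single
/// <8 x i32> shuffle with mask <0, 4, 1, 5, 2, 6, 3, 7>, i.e.
/// <A0, B0, A1, B1, A2, B2, A3, B3>.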
2857 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2858                                 const Twine &Name) {
2859   unsigned Factor = Vals.size();
2860   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2861 
2862   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2863 #ifndef NDEBUG
2864   for (Value *Val : Vals)
2865     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2866 #endif
2867 
2868   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2869   // must use intrinsics to interleave.
2870   if (VecTy->isScalableTy()) {
2871     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2872     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2873                                    Vals,
2874                                    /*FMFSource=*/nullptr, Name);
2875   }
2876 
2877   // Fixed length. Start by concatenating all vectors into a wide vector.
2878   Value *WideVec = concatenateVectors(Builder, Vals);
2879 
2880   // Interleave the elements into the wide vector.
2881   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2882   return Builder.CreateShuffleVector(
2883       WideVec, createInterleaveMask(NumElts, Factor), Name);
2884 }
2885 
2886 // Try to vectorize the interleave group that \p Instr belongs to.
2887 //
2888 // E.g. Translate the following interleaved load group (factor = 3):
2889 //   for (i = 0; i < N; i+=3) {
2890 //     R = Pic[i];             // Member of index 0
2891 //     G = Pic[i+1];           // Member of index 1
2892 //     B = Pic[i+2];           // Member of index 2
2893 //     ... // do something to R, G, B
2894 //   }
2895 // To:
2896 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2897 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2898 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2899 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2900 //
2901 // Or translate the following interleaved store group (factor = 3):
2902 //   for (i = 0; i < N; i+=3) {
2903 //     ... do something to R, G, B
2904 //     Pic[i]   = R;           // Member of index 0
2905 //     Pic[i+1] = G;           // Member of index 1
2906 //     Pic[i+2] = B;           // Member of index 2
2907 //   }
2908 // To:
2909 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2910 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2911 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2912 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2913 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2914 void VPInterleaveRecipe::execute(VPTransformState &State) {
2915   assert(!State.Lane && "Interleave group being replicated.");
2916   const InterleaveGroup<Instruction> *Group = IG;
2917   Instruction *Instr = Group->getInsertPos();
2918 
2919   // Prepare for the vector type of the interleaved load/store.
2920   Type *ScalarTy = getLoadStoreType(Instr);
2921   unsigned InterleaveFactor = Group->getFactor();
2922   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2923 
2924   // TODO: extend the masked interleaved-group support to reversed access.
2925   VPValue *BlockInMask = getMask();
2926   assert((!BlockInMask || !Group->isReverse()) &&
2927          "Reversed masked interleave-group not supported.");
2928 
2929   VPValue *Addr = getAddr();
2930   Value *ResAddr = State.get(Addr, VPLane(0));
2931   if (auto *I = dyn_cast<Instruction>(ResAddr))
2932     State.setDebugLocFrom(I->getDebugLoc());
2933 
2934   // If the group is reversed, adjust the index to refer to the last vector lane
2935   // instead of the first. We adjust the index from the first vector lane,
2936   // rather than directly getting the pointer for lane VF - 1, because the
2937   // pointer operand of the interleaved access is supposed to be uniform.
2938   if (Group->isReverse()) {
2939     Value *RuntimeVF =
2940         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2941     Value *Index =
2942         State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2943     Index = State.Builder.CreateMul(Index,
2944                                     State.Builder.getInt32(Group->getFactor()));
2945     Index = State.Builder.CreateNeg(Index);
2946 
2947     bool InBounds = false;
2948     if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
2949       InBounds = Gep->isInBounds();
2950     ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
2951   }
2952 
2953   State.setDebugLocFrom(Instr->getDebugLoc());
2954   Value *PoisonVec = PoisonValue::get(VecTy);
2955 
2956   auto CreateGroupMask = [&BlockInMask, &State,
2957                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
2958     if (State.VF.isScalable()) {
2959       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2960       assert(InterleaveFactor == 2 &&
2961              "Unsupported deinterleave factor for scalable vectors");
2962       auto *ResBlockInMask = State.get(BlockInMask);
2963       SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2964       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2965                                      State.VF.getKnownMinValue() * 2, true);
2966       return State.Builder.CreateIntrinsic(
2967           MaskTy, Intrinsic::vector_interleave2, Ops,
2968           /*FMFSource=*/nullptr, "interleaved.mask");
2969     }
2970 
2971     if (!BlockInMask)
2972       return MaskForGaps;
2973 
2974     Value *ResBlockInMask = State.get(BlockInMask);
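    // Replicate each mask element across the group members, e.g. for an
    // interleave factor of 3 and VF of 4 the replicated shuffle mask is
    // <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.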
2975     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2976         ResBlockInMask,
2977         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2978         "interleaved.mask");
2979     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2980                                                    ShuffledMask, MaskForGaps)
2981                        : ShuffledMask;
2982   };
2983 
2984   const DataLayout &DL = Instr->getDataLayout();
2985   // Vectorize the interleaved load group.
2986   if (isa<LoadInst>(Instr)) {
2987     Value *MaskForGaps = nullptr;
2988     if (NeedsMaskForGaps) {
2989       MaskForGaps = createBitMaskForGaps(State.Builder,
2990                                          State.VF.getKnownMinValue(), *Group);
2991       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2992     }
2993 
2994     Instruction *NewLoad;
2995     if (BlockInMask || MaskForGaps) {
2996       Value *GroupMask = CreateGroupMask(MaskForGaps);
2997       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
2998                                                Group->getAlign(), GroupMask,
2999                                                PoisonVec, "wide.masked.vec");
3000     } else
3001       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3002                                                 Group->getAlign(), "wide.vec");
3003     Group->addMetadata(NewLoad);
3004 
3005     ArrayRef<VPValue *> VPDefs = definedValues();
3006     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3007     if (VecTy->isScalableTy()) {
3008       assert(InterleaveFactor == 2 &&
3009              "Unsupported deinterleave factor for scalable vectors");
3010 
3011       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3012       // so must use intrinsics to deinterleave.
3013       Value *DI = State.Builder.CreateIntrinsic(
3014           Intrinsic::vector_deinterleave2, VecTy, NewLoad,
3015           /*FMFSource=*/nullptr, "strided.vec");
3016       unsigned J = 0;
3017       for (unsigned I = 0; I < InterleaveFactor; ++I) {
3018         Instruction *Member = Group->getMember(I);
3019 
3020         if (!Member)
3021           continue;
3022 
3023         Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
3024         // If this member has a different type, cast the result to it.
3025         if (Member->getType() != ScalarTy) {
3026           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3027           StridedVec =
3028               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3029         }
3030 
3031         if (Group->isReverse())
3032           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3033 
3034         State.set(VPDefs[J], StridedVec);
3035         ++J;
3036       }
3037 
3038       return;
3039     }
3040 
3041     // For each member in the group, shuffle out the appropriate data from the
3042     // wide loads.
3043     unsigned J = 0;
3044     for (unsigned I = 0; I < InterleaveFactor; ++I) {
3045       Instruction *Member = Group->getMember(I);
3046 
3047       // Skip the gaps in the group.
3048       if (!Member)
3049         continue;
3050 
3051       auto StrideMask =
3052           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
3053       Value *StridedVec =
3054           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
3055 
3056       // If this member has a different type, cast the result to it.
3057       if (Member->getType() != ScalarTy) {
3058         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
3059         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3060         StridedVec =
3061             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3062       }
3063 
3064       if (Group->isReverse())
3065         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3066 
3067       State.set(VPDefs[J], StridedVec);
3068       ++J;
3069     }
3070     return;
3071   }
3072 
3073   // The sub-vector type for the current instruction.
3074   auto *SubVT = VectorType::get(ScalarTy, State.VF);
3075 
3076   // Vectorize the interleaved store group.
3077   Value *MaskForGaps =
3078       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
3079   assert((!MaskForGaps || !State.VF.isScalable()) &&
3080          "masking gaps for scalable vectors is not yet supported.");
3081   ArrayRef<VPValue *> StoredValues = getStoredValues();
3082   // Collect the stored vector from each member.
3083   SmallVector<Value *, 4> StoredVecs;
3084   unsigned StoredIdx = 0;
3085   for (unsigned i = 0; i < InterleaveFactor; i++) {
3086     assert((Group->getMember(i) || MaskForGaps) &&
3087            "Failed to get a member from an interleaved store group");
3088     Instruction *Member = Group->getMember(i);
3089 
3090     // Skip the gaps in the group.
3091     if (!Member) {
3092       Value *Undef = PoisonValue::get(SubVT);
3093       StoredVecs.push_back(Undef);
3094       continue;
3095     }
3096 
3097     Value *StoredVec = State.get(StoredValues[StoredIdx]);
3098     ++StoredIdx;
3099 
3100     if (Group->isReverse())
3101       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
3102 
3103     // If this member has a different type, cast it to the unified type.
3104 
3105     if (StoredVec->getType() != SubVT)
3106       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
3107 
3108     StoredVecs.push_back(StoredVec);
3109   }
3110 
3111   // Interleave all the smaller vectors into one wider vector.
3112   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
3113   Instruction *NewStoreInstr;
3114   if (BlockInMask || MaskForGaps) {
3115     Value *GroupMask = CreateGroupMask(MaskForGaps);
3116     NewStoreInstr = State.Builder.CreateMaskedStore(
3117         IVec, ResAddr, Group->getAlign(), GroupMask);
3118   } else
3119     NewStoreInstr =
3120         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
3121 
3122   Group->addMetadata(NewStoreInstr);
3123 }
3124 
3125 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3126 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3127                                VPSlotTracker &SlotTracker) const {
3128   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3129   IG->getInsertPos()->printAsOperand(O, false);
3130   O << ", ";
3131   getAddr()->printAsOperand(O, SlotTracker);
3132   VPValue *Mask = getMask();
3133   if (Mask) {
3134     O << ", ";
3135     Mask->printAsOperand(O, SlotTracker);
3136   }
3137 
3138   unsigned OpIdx = 0;
3139   for (unsigned i = 0; i < IG->getFactor(); ++i) {
3140     if (!IG->getMember(i))
3141       continue;
3142     if (getNumStoreOperands() > 0) {
3143       O << "\n" << Indent << "  store ";
3144       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3145       O << " to index " << i;
3146     } else {
3147       O << "\n" << Indent << "  ";
3148       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3149       O << " = load from index " << i;
3150     }
3151     ++OpIdx;
3152   }
3153 }
3154 #endif
3155 
3156 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3157                                                 VPCostContext &Ctx) const {
3158   Instruction *InsertPos = getInsertPos();
3159   // Find the VPValue index of the interleave group. We need to skip gaps.
3160   unsigned InsertPosIdx = 0;
3161   for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3162     if (auto *Member = IG->getMember(Idx)) {
3163       if (Member == InsertPos)
3164         break;
3165       InsertPosIdx++;
3166     }
3167   Type *ValTy = Ctx.Types.inferScalarType(
3168       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
3169                                 : getStoredValues()[InsertPosIdx]);
3170   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3171   unsigned AS = getLoadStoreAddressSpace(InsertPos);
3172 
3173   unsigned InterleaveFactor = IG->getFactor();
3174   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
3175 
3176   // Holds the indices of existing members in the interleaved group.
3177   SmallVector<unsigned, 4> Indices;
3178   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3179     if (IG->getMember(IF))
3180       Indices.push_back(IF);
3181 
3182   // Calculate the cost of the whole interleaved group.
3183   InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
3184       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
3185       IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
3186 
3187   if (!IG->isReverse())
3188     return Cost;
3189 
3190   return Cost + IG->getNumMembers() *
3191                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
3192                                            VectorTy, std::nullopt, Ctx.CostKind,
3193                                            0);
3194 }
3195 
3196 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3197 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3198                                    VPSlotTracker &SlotTracker) const {
3199   O << Indent << "EMIT ";
3200   printAsOperand(O, SlotTracker);
3201   O << " = CANONICAL-INDUCTION ";
3202   printOperands(O, SlotTracker);
3203 }
3204 #endif
3205 
3206 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
3207   return IsScalarAfterVectorization &&
3208          (!IsScalable || vputils::onlyFirstLaneUsed(this));
3209 }
3210 
3211 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
3212   assert(getInductionDescriptor().getKind() ==
3213              InductionDescriptor::IK_PtrInduction &&
3214          "Not a pointer induction according to InductionDescriptor!");
3215   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
3216          "Unexpected type.");
3217   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
3218          "Recipe should have been replaced");
3219 
3220   unsigned CurrentPart = getUnrollPart(*this);
3221 
3222   // Build a pointer phi
3223   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
3224   Type *ScStValueType = ScalarStartValue->getType();
3225 
3226   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3227   PHINode *NewPointerPhi = nullptr;
3228   if (CurrentPart == 0) {
3229     auto *IVR = cast<VPHeaderPHIRecipe>(&getParent()
3230                                              ->getPlan()
3231                                              ->getVectorLoopRegion()
3232                                              ->getEntryBasicBlock()
3233                                              ->front());
3234     PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
3235     NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
3236                                     CanonicalIV->getIterator());
3237     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
3238     NewPointerPhi->setDebugLoc(getDebugLoc());
3239   } else {
3240     // The recipe has been unrolled. In that case, fetch the single pointer phi
3241     // shared among all unrolled parts of the recipe.
3242     auto *GEP =
3243         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
3244     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
3245   }
3246 
3247   // A pointer induction, performed by using a gep
3248   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
3249   Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
3250   Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
3251   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
3252   // Add induction update using an incorrect block temporarily. The phi node
3253   // will be fixed after VPlan execution. Note that at this point the latch
3254   // block cannot be used, as it does not exist yet.
3255   // TODO: Model increment value in VPlan, by turning the recipe into a
3256   // multi-def and a subclass of VPHeaderPHIRecipe.
3257   if (CurrentPart == 0) {
3258     // The recipe represents the first part of the pointer induction. Create the
3259     // GEP to increment the phi across all unrolled parts.
3260     unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1;
3261     Value *NumUnrolledElems =
3262         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
3263 
3264     Value *InductionGEP = GetElementPtrInst::Create(
3265         State.Builder.getInt8Ty(), NewPointerPhi,
3266         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
3267         InductionLoc);
3268 
3269     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
3270   }
3271 
3272   // Create actual address geps that use the pointer phi as base and a
3273   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3274   Type *VecPhiType = VectorType::get(PhiType, State.VF);
3275   Value *StartOffsetScalar = State.Builder.CreateMul(
3276       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
3277   Value *StartOffset =
3278       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
3279   // Add a step vector <0, 1, ..., VF-1> to form the per-lane offsets.
3280   StartOffset = State.Builder.CreateAdd(
3281       StartOffset, State.Builder.CreateStepVector(VecPhiType));
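  // E.g. with a fixed VF of 4 and CurrentPart = 1, StartOffset is now
  // <4, 5, 6, 7>; it is scaled by the splatted step value in the GEP below.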
3282 
3283   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
3284          "scalar step must be the same across all parts");
3285   Value *GEP = State.Builder.CreateGEP(
3286       State.Builder.getInt8Ty(), NewPointerPhi,
3287       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
3288                                                State.VF, ScalarStepValue)),
3289       "vector.gep");
3290   State.set(this, GEP);
3291 }
3292 
3293 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3294 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
3295                                           VPSlotTracker &SlotTracker) const {
3296   assert((getNumOperands() == 2 || getNumOperands() == 4) &&
3297          "unexpected number of operands");
3298   O << Indent << "EMIT ";
3299   printAsOperand(O, SlotTracker);
3300   O << " = WIDEN-POINTER-INDUCTION ";
3301   getStartValue()->printAsOperand(O, SlotTracker);
3302   O << ", ";
3303   getStepValue()->printAsOperand(O, SlotTracker);
3304   if (getNumOperands() == 4) {
3305     O << ", ";
3306     getOperand(2)->printAsOperand(O, SlotTracker);
3307     O << ", ";
3308     getOperand(3)->printAsOperand(O, SlotTracker);
3309   }
3310 }
3311 #endif
3312 
3313 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3314   assert(!State.Lane && "cannot be used in per-lane");
3315   if (State.ExpandedSCEVs.contains(Expr)) {
3316     // SCEV Expr has already been expanded, result must already be set. At the
3317     // moment we have to execute the entry block twice (once before skeleton
3318     // creation to get expanded SCEVs used by the skeleton and once during
3319     // regular VPlan execution).
3320     State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
3321     assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
3322            "Results must match");
3323     return;
3324   }
3325 
3326   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3327   SCEVExpander Exp(SE, DL, "induction");
3328 
3329   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
3330                                  &*State.Builder.GetInsertPoint());
3331   State.ExpandedSCEVs[Expr] = Res;
3332   State.set(this, Res, VPLane(0));
3333 }
3334 
3335 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3336 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3337                                VPSlotTracker &SlotTracker) const {
3338   O << Indent << "EMIT ";
3339   printAsOperand(O, SlotTracker);
3340   O << " = EXPAND SCEV " << *Expr;
3341 }
3342 #endif
3343 
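// Sketch (illustrative types): for a fixed VF of 4, unroll part 0 and an i64
// canonical IV, this emits roughly
//   %vec.iv = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>
// where %broadcast is the splatted scalar canonical IV.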
3344 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3345   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3346   Type *STy = CanonicalIV->getType();
3347   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3348   ElementCount VF = State.VF;
3349   Value *VStart = VF.isScalar()
3350                       ? CanonicalIV
3351                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3352   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3353   if (VF.isVector()) {
3354     VStep = Builder.CreateVectorSplat(VF, VStep);
3355     VStep =
3356         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3357   }
3358   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3359   State.set(this, CanonicalVectorIV);
3360 }
3361 
3362 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3363 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3364                                      VPSlotTracker &SlotTracker) const {
3365   O << Indent << "EMIT ";
3366   printAsOperand(O, SlotTracker);
3367   O << " = WIDEN-CANONICAL-INDUCTION ";
3368   printOperands(O, SlotTracker);
3369 }
3370 #endif
3371 
3372 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3373   auto &Builder = State.Builder;
3374   // Create a vector from the initial value.
3375   auto *VectorInit = getStartValue()->getLiveInIRValue();
3376 
3377   Type *VecTy = State.VF.isScalar()
3378                     ? VectorInit->getType()
3379                     : VectorType::get(VectorInit->getType(), State.VF);
3380 
3381   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3382   if (State.VF.isVector()) {
3383     auto *IdxTy = Builder.getInt32Ty();
3384     auto *One = ConstantInt::get(IdxTy, 1);
3385     IRBuilder<>::InsertPointGuard Guard(Builder);
3386     Builder.SetInsertPoint(VectorPH->getTerminator());
3387     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3388     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3389     VectorInit = Builder.CreateInsertElement(
3390         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3391   }
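  // E.g. for a fixed VF of 4 the init vector is <poison, poison, poison, %init>;
  // only the last lane is used by the recurrence splice, so the remaining lanes
  // may be poison.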
3392 
3393   // Create a phi node for the new recurrence.
3394   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3395   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3396   Phi->addIncoming(VectorInit, VectorPH);
3397   State.set(this, Phi);
3398 }
3399 
3400 InstructionCost
3401 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
3402                                              VPCostContext &Ctx) const {
3403   if (VF.isScalar())
3404     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3405 
3406   if (VF.isScalable() && VF.getKnownMinValue() == 1)
3407     return InstructionCost::getInvalid();
3408 
3409   SmallVector<int> Mask(VF.getKnownMinValue());
3410   std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
3411   Type *VectorTy =
3412       toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
3413 
3414   return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
3415                                 cast<VectorType>(VectorTy), Mask, Ctx.CostKind,
3416                                 VF.getKnownMinValue() - 1);
3417 }
3418 
3419 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3420 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3421                                             VPSlotTracker &SlotTracker) const {
3422   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3423   printAsOperand(O, SlotTracker);
3424   O << " = phi ";
3425   printOperands(O, SlotTracker);
3426 }
3427 #endif
3428 
3429 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3430   auto &Builder = State.Builder;
3431 
3432   // If this phi is fed by a scaled reduction then it should output a
3433   // vector with fewer elements than the VF.
3434   ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);
3435 
3436   // Reductions do not have to start at zero. They can start with
3437   // any loop invariant values.
3438   VPValue *StartVPV = getStartValue();
3439   Value *StartV = StartVPV->getLiveInIRValue();
3440 
3441   // In order to support recurrences we need to be able to vectorize Phi nodes.
3442   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3443   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3444   // this value when we vectorize all of the instructions that use the PHI.
3445   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3446   Type *VecTy =
3447       ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);
3448 
3449   BasicBlock *HeaderBB = State.CFG.PrevBB;
3450   assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
3451          "recipe must be in the vector loop header");
3452   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3453   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3454   State.set(this, Phi, IsInLoop);
3455 
3456   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3457 
3458   Value *Iden = nullptr;
3459   RecurKind RK = RdxDesc.getRecurrenceKind();
3460   unsigned CurrentPart = getUnrollPart(*this);
3461 
3462   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3463       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3464     // MinMax and AnyOf reductions have the start value as their identity.
3465     if (ScalarPHI) {
3466       Iden = StartV;
3467     } else {
3468       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3469       Builder.SetInsertPoint(VectorPH->getTerminator());
3470       StartV = Iden = State.get(StartVPV);
3471     }
3472   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
3473     // [I|F]FindLastIV will use a sentinel value to initialize the reduction
3474     // phi or the resume value from the main vector loop when vectorizing the
3475     // epilogue loop. In the exit block, ComputeReductionResult will generate
3476     // checks to verify if the reduction result is the sentinel value. If the
3477     // result is the sentinel value, it will be corrected back to the start
3478     // value.
3479     // TODO: The sentinel value is not always necessary. When the start value is
3480     // a constant, and smaller than the start value of the induction variable,
3481     // the start value can be directly used to initialize the reduction phi.
3482     Iden = StartV;
3483     if (!ScalarPHI) {
3484       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3485       Builder.SetInsertPoint(VectorPH->getTerminator());
3486       StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
3487     }
3488   } else {
3489     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3490                                        RdxDesc.getFastMathFlags());
3491 
3492     if (!ScalarPHI) {
3493       if (CurrentPart == 0) {
3494         // Create start and identity vector values for the reduction in the
3495         // preheader.
3496         // TODO: Introduce recipes in VPlan preheader to create initial values.
3497         Iden = Builder.CreateVectorSplat(VF, Iden);
3498         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3499         Builder.SetInsertPoint(VectorPH->getTerminator());
3500         Constant *Zero = Builder.getInt32(0);
3501         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
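        // E.g. for an integer add reduction with a fixed VF of 4 this yields a
        // start vector of <%start, 0, 0, 0> and an identity vector of zeros.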
3502       } else {
3503         Iden = Builder.CreateVectorSplat(VF, Iden);
3504       }
3505     }
3506   }
3507 
3508   Phi = cast<PHINode>(State.get(this, IsInLoop));
3509   Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3510   Phi->addIncoming(StartVal, VectorPH);
3511 }
3512 
3513 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3514 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3515                                  VPSlotTracker &SlotTracker) const {
3516   O << Indent << "WIDEN-REDUCTION-PHI ";
3517 
3518   printAsOperand(O, SlotTracker);
3519   O << " = phi ";
3520   printOperands(O, SlotTracker);
3521   if (VFScaleFactor != 1)
3522     O << " (VF scaled by 1/" << VFScaleFactor << ")";
3523 }
3524 #endif
3525 
3526 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3527   assert(EnableVPlanNativePath &&
3528          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3529 
3530   State.setDebugLocFrom(getDebugLoc());
3531   Value *Op0 = State.get(getOperand(0));
3532   Type *VecTy = Op0->getType();
3533   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3534   State.set(this, VecPhi);
3535 }
3536 
3537 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3538 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3539                              VPSlotTracker &SlotTracker) const {
3540   O << Indent << "WIDEN-PHI ";
3541 
3542   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3543   // Unless all incoming values are modeled in VPlan, print the original PHI
3544   // directly.
3545   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3546   // values as VPValues.
3547   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3548     O << VPlanIngredient(OriginalPhi);
3549     return;
3550   }
3551 
3552   printAsOperand(O, SlotTracker);
3553   O << " = phi ";
3554   printOperands(O, SlotTracker);
3555 }
3556 #endif
3557 
3558 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3559 // remove VPActiveLaneMaskPHIRecipe.
3560 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3561   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3562   Value *StartMask = State.get(getOperand(0));
3563   PHINode *Phi =
3564       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3565   Phi->addIncoming(StartMask, VectorPH);
3566   Phi->setDebugLoc(getDebugLoc());
3567   State.set(this, Phi);
3568 }
3569 
3570 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3571 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3572                                       VPSlotTracker &SlotTracker) const {
3573   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3574 
3575   printAsOperand(O, SlotTracker);
3576   O << " = phi ";
3577   printOperands(O, SlotTracker);
3578 }
3579 #endif
3580 
3581 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3582 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3583                                   VPSlotTracker &SlotTracker) const {
3584   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3585 
3586   printAsOperand(O, SlotTracker);
3587   O << " = phi ";
3588   printOperands(O, SlotTracker);
3589 }
3590 #endif
3591 
3592 void VPScalarPHIRecipe::execute(VPTransformState &State) {
3593   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3594   Value *Start = State.get(getStartValue(), VPLane(0));
3595   PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
3596   Phi->addIncoming(Start, VectorPH);
3597   Phi->setDebugLoc(getDebugLoc());
3598   State.set(this, Phi, /*IsScalar=*/true);
3599 }
3600 
3601 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3602 void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3603                               VPSlotTracker &SlotTracker) const {
3604   O << Indent << "SCALAR-PHI ";
3605   printAsOperand(O, SlotTracker);
3606   O << " = phi ";
3607   printOperands(O, SlotTracker);
3608 }
3609 #endif
3610