1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "VPlanPatternMatch.h"
17 #include "VPlanUtils.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/IVDescriptors.h"
22 #include "llvm/IR/BasicBlock.h"
23 #include "llvm/IR/IRBuilder.h"
24 #include "llvm/IR/Instruction.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/IR/Value.h"
29 #include "llvm/IR/VectorBuilder.h"
30 #include "llvm/Support/Casting.h"
31 #include "llvm/Support/CommandLine.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
37 #include <cassert>
38 
39 using namespace llvm;
40 
41 using VectorParts = SmallVector<Value *, 2>;
42 
43 namespace llvm {
44 extern cl::opt<bool> EnableVPlanNativePath;
45 }
46 extern cl::opt<unsigned> ForceTargetInstructionCost;
47 
48 #define LV_NAME "loop-vectorize"
49 #define DEBUG_TYPE LV_NAME
50 
51 bool VPRecipeBase::mayWriteToMemory() const {
52   switch (getVPDefID()) {
53   case VPInstructionSC:
54     if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()))
55       return false;
56     switch (cast<VPInstruction>(this)->getOpcode()) {
57     case Instruction::Or:
58     case Instruction::ICmp:
59     case Instruction::Select:
60     case VPInstruction::AnyOf:
61     case VPInstruction::Not:
62     case VPInstruction::CalculateTripCountMinusVF:
63     case VPInstruction::CanonicalIVIncrementForPart:
64     case VPInstruction::ExtractFromEnd:
65     case VPInstruction::FirstOrderRecurrenceSplice:
66     case VPInstruction::LogicalAnd:
67     case VPInstruction::PtrAdd:
68       return false;
69     default:
70       return true;
71     }
72   case VPInterleaveSC:
73     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
74   case VPWidenStoreEVLSC:
75   case VPWidenStoreSC:
76     return true;
77   case VPReplicateSC:
78     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
79         ->mayWriteToMemory();
80   case VPWidenCallSC:
81     return !cast<VPWidenCallRecipe>(this)
82                 ->getCalledScalarFunction()
83                 ->onlyReadsMemory();
84   case VPWidenIntrinsicSC:
85     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
86   case VPBranchOnMaskSC:
87   case VPScalarIVStepsSC:
88   case VPPredInstPHISC:
89     return false;
90   case VPBlendSC:
91   case VPReductionEVLSC:
92   case VPReductionSC:
93   case VPVectorPointerSC:
94   case VPWidenCanonicalIVSC:
95   case VPWidenCastSC:
96   case VPWidenGEPSC:
97   case VPWidenIntOrFpInductionSC:
98   case VPWidenLoadEVLSC:
99   case VPWidenLoadSC:
100   case VPWidenPHISC:
101   case VPWidenSC:
102   case VPWidenEVLSC:
103   case VPWidenSelectSC: {
104     const Instruction *I =
105         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
106     (void)I;
107     assert((!I || !I->mayWriteToMemory()) &&
108            "underlying instruction may write to memory");
109     return false;
110   }
111   default:
112     return true;
113   }
114 }
115 
116 bool VPRecipeBase::mayReadFromMemory() const {
117   switch (getVPDefID()) {
118   case VPWidenLoadEVLSC:
119   case VPWidenLoadSC:
120     return true;
121   case VPReplicateSC:
122     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
123         ->mayReadFromMemory();
124   case VPWidenCallSC:
125     return !cast<VPWidenCallRecipe>(this)
126                 ->getCalledScalarFunction()
127                 ->onlyWritesMemory();
128   case VPWidenIntrinsicSC:
129     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
130   case VPBranchOnMaskSC:
131   case VPPredInstPHISC:
132   case VPScalarIVStepsSC:
133   case VPWidenStoreEVLSC:
134   case VPWidenStoreSC:
135     return false;
136   case VPBlendSC:
137   case VPReductionEVLSC:
138   case VPReductionSC:
139   case VPVectorPointerSC:
140   case VPWidenCanonicalIVSC:
141   case VPWidenCastSC:
142   case VPWidenGEPSC:
143   case VPWidenIntOrFpInductionSC:
144   case VPWidenPHISC:
145   case VPWidenSC:
146   case VPWidenEVLSC:
147   case VPWidenSelectSC: {
148     const Instruction *I =
149         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
150     (void)I;
151     assert((!I || !I->mayReadFromMemory()) &&
152            "underlying instruction may read from memory");
153     return false;
154   }
155   default:
156     return true;
157   }
158 }
159 
160 bool VPRecipeBase::mayHaveSideEffects() const {
161   switch (getVPDefID()) {
162   case VPDerivedIVSC:
163   case VPPredInstPHISC:
164   case VPScalarCastSC:
165   case VPReverseVectorPointerSC:
166     return false;
167   case VPInstructionSC:
168     return mayWriteToMemory();
169   case VPWidenCallSC: {
170     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
171     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
172   }
173   case VPWidenIntrinsicSC:
174     return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
175   case VPBlendSC:
176   case VPReductionEVLSC:
177   case VPReductionSC:
178   case VPScalarIVStepsSC:
179   case VPVectorPointerSC:
180   case VPWidenCanonicalIVSC:
181   case VPWidenCastSC:
182   case VPWidenGEPSC:
183   case VPWidenIntOrFpInductionSC:
184   case VPWidenPHISC:
185   case VPWidenPointerInductionSC:
186   case VPWidenSC:
187   case VPWidenEVLSC:
188   case VPWidenSelectSC: {
189     const Instruction *I =
190         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
191     (void)I;
192     assert((!I || !I->mayHaveSideEffects()) &&
193            "underlying instruction has side-effects");
194     return false;
195   }
196   case VPInterleaveSC:
197     return mayWriteToMemory();
198   case VPWidenLoadEVLSC:
199   case VPWidenLoadSC:
200   case VPWidenStoreEVLSC:
201   case VPWidenStoreSC:
202     assert(
203         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
204             mayWriteToMemory() &&
205         "mayHaveSideEffects result for ingredient differs from this "
206         "implementation");
207     return mayWriteToMemory();
208   case VPReplicateSC: {
209     auto *R = cast<VPReplicateRecipe>(this);
210     return R->getUnderlyingInstr()->mayHaveSideEffects();
211   }
212   default:
213     return true;
214   }
215 }
216 
217 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
218   assert(!Parent && "Recipe already in some VPBasicBlock");
219   assert(InsertPos->getParent() &&
220          "Insertion position not in any VPBasicBlock");
221   InsertPos->getParent()->insert(this, InsertPos->getIterator());
222 }
223 
224 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
225                                 iplist<VPRecipeBase>::iterator I) {
226   assert(!Parent && "Recipe already in some VPBasicBlock");
227   assert(I == BB.end() || I->getParent() == &BB);
228   BB.insert(this, I);
229 }
230 
231 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
232   assert(!Parent && "Recipe already in some VPBasicBlock");
233   assert(InsertPos->getParent() &&
234          "Insertion position not in any VPBasicBlock");
235   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
236 }
237 
238 void VPRecipeBase::removeFromParent() {
239   assert(getParent() && "Recipe not in any VPBasicBlock");
240   getParent()->getRecipeList().remove(getIterator());
241   Parent = nullptr;
242 }
243 
244 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
245   assert(getParent() && "Recipe not in any VPBasicBlock");
246   return getParent()->getRecipeList().erase(getIterator());
247 }
248 
249 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
250   removeFromParent();
251   insertAfter(InsertPos);
252 }
253 
254 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
255                               iplist<VPRecipeBase>::iterator I) {
256   removeFromParent();
257   insertBefore(BB, I);
258 }
259 
260 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
261   // Get the underlying instruction for the recipe, if there is one. It is
262   // used to
263   //   * decide if cost computation should be skipped for this recipe,
264   //   * apply forced target instruction cost.
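  // For example (illustrative, assuming the flag spelling defined in
  // LoopVectorize.cpp), running with -force-target-instruction-cost=1 makes
  // every recipe with an underlying instruction report a cost of 1, which
  // helps separate vectorization decisions from target cost-model details.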
265   Instruction *UI = nullptr;
266   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
267     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
268   else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
269     UI = IG->getInsertPos();
270   else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
271     UI = &WidenMem->getIngredient();
272 
273   InstructionCost RecipeCost;
274   if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
275     RecipeCost = 0;
276   } else {
277     RecipeCost = computeCost(VF, Ctx);
278     if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
279         RecipeCost.isValid())
280       RecipeCost = InstructionCost(ForceTargetInstructionCost);
281   }
282 
283   LLVM_DEBUG({
284     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
285     dump();
286   });
287   return RecipeCost;
288 }
289 
290 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
291                                           VPCostContext &Ctx) const {
292   llvm_unreachable("subclasses should implement computeCost");
293 }
294 
295 InstructionCost
296 VPPartialReductionRecipe::computeCost(ElementCount VF,
297                                       VPCostContext &Ctx) const {
298   std::optional<unsigned> Opcode = std::nullopt;
299   VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe();
300   if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR))
301     Opcode = std::make_optional(WidenR->getOpcode());
302 
303   VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe();
304   VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe();
305 
306   auto GetExtendKind = [](VPRecipeBase *R) {
307     auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
308     if (!WidenCastR)
309       return TargetTransformInfo::PR_None;
310     if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
311       return TargetTransformInfo::PR_ZeroExtend;
312     if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
313       return TargetTransformInfo::PR_SignExtend;
314     return TargetTransformInfo::PR_None;
315   };
316 
317   auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
318   auto *ExtTy = Ctx.Types.inferScalarType(ExtAR->getOperand(0));
319 
320   return Ctx.TTI.getPartialReductionCost(getOpcode(), ExtTy, PhiType, VF,
321                                          GetExtendKind(ExtAR),
322                                          GetExtendKind(ExtBR), Opcode);
323 }
324 
325 void VPPartialReductionRecipe::execute(VPTransformState &State) {
326   State.setDebugLocFrom(getDebugLoc());
327   auto &Builder = State.Builder;
328 
329   assert(getOpcode() == Instruction::Add &&
330          "Unhandled partial reduction opcode");
331 
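  // Illustrative shape of the generated call (assumed types, not taken from a
  // specific test): accumulating a <16 x i32> input into a <4 x i32> phi emits
  //   %partial.reduce = call <4 x i32>
  //     @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(
  //       <4 x i32> %phi, <16 x i32> %bin.op)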
332   Value *BinOpVal = State.get(getOperand(0));
333   Value *PhiVal = State.get(getOperand(1));
334   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
335 
336   Type *RetTy = PhiVal->getType();
337 
338   CallInst *V = Builder.CreateIntrinsic(
339       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
340       {PhiVal, BinOpVal}, nullptr, "partial.reduce");
341 
342   State.set(this, V);
343 }
344 
345 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
346 void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
347                                      VPSlotTracker &SlotTracker) const {
348   O << Indent << "PARTIAL-REDUCE ";
349   printAsOperand(O, SlotTracker);
350   O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
351   printOperands(O, SlotTracker);
352 }
353 #endif
354 
355 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
356   assert(OpType == OperationType::FPMathOp &&
357          "recipe doesn't have fast math flags");
358   FastMathFlags Res;
359   Res.setAllowReassoc(FMFs.AllowReassoc);
360   Res.setNoNaNs(FMFs.NoNaNs);
361   Res.setNoInfs(FMFs.NoInfs);
362   Res.setNoSignedZeros(FMFs.NoSignedZeros);
363   Res.setAllowReciprocal(FMFs.AllowReciprocal);
364   Res.setAllowContract(FMFs.AllowContract);
365   Res.setApproxFunc(FMFs.ApproxFunc);
366   return Res;
367 }
368 
369 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
370 void VPSingleDefRecipe::dump() const { VPDef::dump(); }
371 #endif
372 
373 template <unsigned PartOpIdx>
374 VPValue *
375 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
376   if (U.getNumOperands() == PartOpIdx + 1)
377     return U.getOperand(PartOpIdx);
378   return nullptr;
379 }
380 
381 template <unsigned PartOpIdx>
382 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
383   if (auto *UnrollPartOp = getUnrollPartOperand(U))
384     return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
385   return 0;
386 }
387 
388 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
389                              VPValue *A, VPValue *B, DebugLoc DL,
390                              const Twine &Name)
391     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
392                           Pred, DL),
393       Opcode(Opcode), Name(Name.str()) {
394   assert(Opcode == Instruction::ICmp &&
395          "only ICmp predicates supported at the moment");
396 }
397 
398 VPInstruction::VPInstruction(unsigned Opcode,
399                              std::initializer_list<VPValue *> Operands,
400                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
401     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
402       Opcode(Opcode), Name(Name.str()) {
403   // Make sure the VPInstruction is a floating-point operation.
404   assert(isFPMathOp() && "this op can't take fast-math flags");
405 }
406 
407 bool VPInstruction::doesGeneratePerAllLanes() const {
408   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
409 }
410 
411 bool VPInstruction::canGenerateScalarForFirstLane() const {
412   if (Instruction::isBinaryOp(getOpcode()))
413     return true;
414   if (isSingleScalar() || isVectorToScalar())
415     return true;
416   switch (Opcode) {
417   case Instruction::ICmp:
418   case Instruction::Select:
419   case VPInstruction::BranchOnCond:
420   case VPInstruction::BranchOnCount:
421   case VPInstruction::CalculateTripCountMinusVF:
422   case VPInstruction::CanonicalIVIncrementForPart:
423   case VPInstruction::PtrAdd:
424   case VPInstruction::ExplicitVectorLength:
425   case VPInstruction::AnyOf:
426     return true;
427   default:
428     return false;
429   }
430 }
431 
432 Value *VPInstruction::generatePerLane(VPTransformState &State,
433                                       const VPLane &Lane) {
434   IRBuilderBase &Builder = State.Builder;
435 
436   assert(getOpcode() == VPInstruction::PtrAdd &&
437          "only PtrAdd opcodes are supported for now");
438   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
439                               State.get(getOperand(1), Lane), Name);
440 }
441 
442 Value *VPInstruction::generate(VPTransformState &State) {
443   IRBuilderBase &Builder = State.Builder;
444 
445   if (Instruction::isBinaryOp(getOpcode())) {
446     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
447     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
448     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
449     auto *Res =
450         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
451     if (auto *I = dyn_cast<Instruction>(Res))
452       setFlags(I);
453     return Res;
454   }
455 
456   switch (getOpcode()) {
457   case VPInstruction::Not: {
458     Value *A = State.get(getOperand(0));
459     return Builder.CreateNot(A, Name);
460   }
461   case Instruction::ICmp: {
462     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
463     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
464     Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
465     return Builder.CreateCmp(getPredicate(), A, B, Name);
466   }
467   case Instruction::Select: {
468     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
469     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
470     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
471     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
472     return Builder.CreateSelect(Cond, Op1, Op2, Name);
473   }
474   case VPInstruction::ActiveLaneMask: {
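    // For vector VFs this lowers to llvm.get.active.lane.mask; illustratively,
    // for VF 4 and an i64 trip count:
    //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv0,
    //                                                            i64 %tc)
    // For scalar VFs a plain icmp ult is emitted instead (see below).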
475     // Get first lane of vector induction variable.
476     Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
477     // Get the original loop tripcount.
478     Value *ScalarTC = State.get(getOperand(1), VPLane(0));
479 
480     // If this part of the active lane mask is scalar, generate the CMP directly
481     // to avoid unnecessary extracts.
482     if (State.VF.isScalar())
483       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
484                                Name);
485 
486     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
487     auto *PredTy = VectorType::get(Int1Ty, State.VF);
488     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
489                                    {PredTy, ScalarTC->getType()},
490                                    {VIVElem0, ScalarTC}, nullptr, Name);
491   }
492   case VPInstruction::FirstOrderRecurrenceSplice: {
493     // Generate code to combine the previous and current values in vector v3.
494     //
495     //   vector.ph:
496     //     v_init = vector(..., ..., ..., a[-1])
497     //     br vector.body
498     //
499     //   vector.body
500     //     i = phi [0, vector.ph], [i+4, vector.body]
501     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
502     //     v2 = a[i, i+1, i+2, i+3];
503     //     v3 = vector(v1(3), v2(0, 1, 2))
504 
505     auto *V1 = State.get(getOperand(0));
506     if (!V1->getType()->isVectorTy())
507       return V1;
508     Value *V2 = State.get(getOperand(1));
509     return Builder.CreateVectorSplice(V1, V2, -1, Name);
510   }
511   case VPInstruction::CalculateTripCountMinusVF: {
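    // Computes TC > VF * UF ? TC - VF * UF : 0. Illustratively, for a fixed
    // VF of 4, UF of 2 and an i64 trip count this expands to:
    //   %sub = sub i64 %tc, 8
    //   %cmp = icmp ugt i64 %tc, 8
    //   %sel = select i1 %cmp, i64 %sub, i64 0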
512     unsigned UF = getParent()->getPlan()->getUF();
513     Value *ScalarTC = State.get(getOperand(0), VPLane(0));
514     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
515     Value *Sub = Builder.CreateSub(ScalarTC, Step);
516     Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
517     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
518     return Builder.CreateSelect(Cmp, Sub, Zero);
519   }
520   case VPInstruction::ExplicitVectorLength: {
521     // TODO: Restructure this code with an explicit remainder loop, so that
522     // vsetvli can be outside of the main loop.
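    // Illustratively, for VF = vscale x 4 and an i64 AVL this emits:
    //   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl,
    //                                                            i32 4, i1 true)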
523     Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
524     // Compute EVL
525     assert(AVL->getType()->isIntegerTy() &&
526            "Requested vector length should be an integer.");
527 
528     assert(State.VF.isScalable() && "Expected scalable vector factor.");
529     Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
530 
531     Value *EVL = State.Builder.CreateIntrinsic(
532         State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
533         {AVL, VFArg, State.Builder.getTrue()});
534     return EVL;
535   }
536   case VPInstruction::CanonicalIVIncrementForPart: {
537     unsigned Part = getUnrollPart(*this);
538     auto *IV = State.get(getOperand(0), VPLane(0));
539     assert(Part != 0 && "Must have a positive part");
540     // The canonical IV is incremented by the vectorization factor (num of
541     // SIMD elements) times the unroll part.
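    // Illustratively, for a fixed VF of 4 and unroll part 2 the step is 8; for
    // VF = vscale x 4 it is (vscale * 4) * 2.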
542     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
543     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
544                              hasNoSignedWrap());
545   }
546   case VPInstruction::BranchOnCond: {
547     Value *Cond = State.get(getOperand(0), VPLane(0));
548     // Replace the temporary unreachable terminator with a new conditional
549     // branch, hooking it up to the backward destination for exiting blocks now
550     // and to the forward destination(s) later when they are created.
551     BranchInst *CondBr =
552         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
553     CondBr->setSuccessor(0, nullptr);
554     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
555 
556     if (!getParent()->isExiting())
557       return CondBr;
558 
559     VPRegionBlock *ParentRegion = getParent()->getParent();
560     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
561     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
562     return CondBr;
563   }
564   case VPInstruction::BranchOnCount: {
565     // First create the compare.
566     Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
567     Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
568     Value *Cond = Builder.CreateICmpEQ(IV, TC);
569 
570     // Now create the branch.
571     auto *Plan = getParent()->getPlan();
572     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
573     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
574 
575     // Replace the temporary unreachable terminator with a new conditional
576     // branch, hooking it up to the backward destination (the header) now and
577     // to the forward destination (the exit/middle block) later when it is
578     // created. Note that CreateCondBr expects a valid BB as its first argument,
579     // so we need to set it to nullptr later.
580     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
581                                               State.CFG.VPBB2IRBB[Header]);
582     CondBr->setSuccessor(0, nullptr);
583     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
584     return CondBr;
585   }
586   case VPInstruction::ComputeReductionResult: {
587     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
588     // and will be removed by breaking up the recipe further.
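    // Illustrative result (assumed shapes): for an unordered integer add
    // reduction with UF 2 and VF 4, the parts are first combined and the
    // result is then reduced to a scalar, roughly:
    //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
    //   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)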
589     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
590     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
591     // Get its reduction variable descriptor.
592     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
593 
594     RecurKind RK = RdxDesc.getRecurrenceKind();
595 
596     Type *PhiTy = OrigPhi->getType();
597     // The recipe's operands are the reduction phi, followed by one operand for
598     // each part of the reduction.
599     unsigned UF = getNumOperands() - 1;
600     VectorParts RdxParts(UF);
601     for (unsigned Part = 0; Part < UF; ++Part)
602       RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
603 
604     // If the vector reduction can be performed in a smaller type, we truncate
605     // then extend the loop exit value to enable InstCombine to evaluate the
606     // entire expression in the smaller type.
607     // TODO: Handle this in truncateToMinBW.
608     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
609       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
610       for (unsigned Part = 0; Part < UF; ++Part)
611         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
612     }
613     // Reduce all of the unrolled parts into a single vector.
614     Value *ReducedPartRdx = RdxParts[0];
615     unsigned Op = RdxDesc.getOpcode();
616     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
617       Op = Instruction::Or;
618 
619     if (PhiR->isOrdered()) {
620       ReducedPartRdx = RdxParts[UF - 1];
621     } else {
622       // Floating-point operations should have some FMF to enable the reduction.
623       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
624       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
625       for (unsigned Part = 1; Part < UF; ++Part) {
626         Value *RdxPart = RdxParts[Part];
627         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
628           ReducedPartRdx = Builder.CreateBinOp(
629               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
630         else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
631           ReducedPartRdx =
632               createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
633         else
634           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
635       }
636     }
637 
638     // Create the reduction after the loop. Note that inloop reductions create
639     // the target reduction in the loop using a Reduction recipe.
640     if ((State.VF.isVector() ||
641          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
642          RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
643         !PhiR->isInLoop()) {
644       ReducedPartRdx =
645           createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
646       // If the reduction can be performed in a smaller type, we need to extend
647       // the reduction to the wider type before we branch to the original loop.
648       if (PhiTy != RdxDesc.getRecurrenceType())
649         ReducedPartRdx = RdxDesc.isSigned()
650                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
651                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
652     }
653 
654     return ReducedPartRdx;
655   }
656   case VPInstruction::ExtractFromEnd: {
657     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
658     unsigned Offset = CI->getZExtValue();
659     assert(Offset > 0 && "Offset from end must be positive");
660     Value *Res;
661     if (State.VF.isVector()) {
662       assert(Offset <= State.VF.getKnownMinValue() &&
663              "invalid offset to extract from");
664       // Extract lane VF - Offset from the operand.
665       Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
666     } else {
667       assert(Offset <= 1 && "invalid offset to extract from");
668       Res = State.get(getOperand(0));
669     }
670     if (isa<ExtractElementInst>(Res))
671       Res->setName(Name);
672     return Res;
673   }
674   case VPInstruction::LogicalAnd: {
675     Value *A = State.get(getOperand(0));
676     Value *B = State.get(getOperand(1));
677     return Builder.CreateLogicalAnd(A, B, Name);
678   }
679   case VPInstruction::PtrAdd: {
680     assert(vputils::onlyFirstLaneUsed(this) &&
681            "can only generate first lane for PtrAdd");
682     Value *Ptr = State.get(getOperand(0), VPLane(0));
683     Value *Addend = State.get(getOperand(1), VPLane(0));
684     return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
685   }
686   case VPInstruction::ResumePhi: {
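    // Creates a phi that takes the value of operand 0 from the VPlan
    // predecessor and the value of operand 1 from all other predecessors,
    // e.g. (illustrative names) a resume value in the scalar preheader:
    //   %bc.resume.val = phi i64 [ %vec.end, %middle.block ], [ %start, %entry ]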
687     Value *IncomingFromVPlanPred =
688         State.get(getOperand(0), /* IsScalar */ true);
689     Value *IncomingFromOtherPreds =
690         State.get(getOperand(1), /* IsScalar */ true);
691     auto *NewPhi =
692         Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
693     BasicBlock *VPlanPred =
694         State.CFG
695             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
696     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
697     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
698       if (OtherPred == VPlanPred)
699         continue;
700       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
701     }
702     return NewPhi;
703   }
704   case VPInstruction::AnyOf: {
705     Value *A = State.get(getOperand(0));
706     return Builder.CreateOrReduce(A);
707   }
708 
709   default:
710     llvm_unreachable("Unsupported opcode for instruction");
711   }
712 }
713 
714 bool VPInstruction::isVectorToScalar() const {
715   return getOpcode() == VPInstruction::ExtractFromEnd ||
716          getOpcode() == VPInstruction::ComputeReductionResult ||
717          getOpcode() == VPInstruction::AnyOf;
718 }
719 
720 bool VPInstruction::isSingleScalar() const {
721   return getOpcode() == VPInstruction::ResumePhi;
722 }
723 
724 #if !defined(NDEBUG)
725 bool VPInstruction::isFPMathOp() const {
726   // Inspired by FPMathOperator::classof. Notable differences are that we don't
727   // support Call, PHI and Select opcodes here yet.
728   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
729          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
730          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
731          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
732 }
733 #endif
734 
735 void VPInstruction::execute(VPTransformState &State) {
736   assert(!State.Lane && "VPInstruction executing a Lane");
737   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
738   assert((hasFastMathFlags() == isFPMathOp() ||
739           getOpcode() == Instruction::Select) &&
740          "Recipe not an FPMathOp but has fast-math flags?");
741   if (hasFastMathFlags())
742     State.Builder.setFastMathFlags(getFastMathFlags());
743   State.setDebugLocFrom(getDebugLoc());
744   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
745                                    (vputils::onlyFirstLaneUsed(this) ||
746                                     isVectorToScalar() || isSingleScalar());
747   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
748   if (GeneratesPerAllLanes) {
749     for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
750          Lane != NumLanes; ++Lane) {
751       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
752       assert(GeneratedValue && "generatePerLane must produce a value");
753       State.set(this, GeneratedValue, VPLane(Lane));
754     }
755     return;
756   }
757 
758   Value *GeneratedValue = generate(State);
759   if (!hasResult())
760     return;
761   assert(GeneratedValue && "generate must produce a value");
762   assert(
763       (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
764        State.VF.isScalar()) &&
765       "scalar value but not only first lane defined");
766   State.set(this, GeneratedValue,
767             /*IsScalar*/ GeneratesPerFirstLaneOnly);
768 }
769 
770 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
771   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
772   if (Instruction::isBinaryOp(getOpcode()))
773     return vputils::onlyFirstLaneUsed(this);
774 
775   switch (getOpcode()) {
776   default:
777     return false;
778   case Instruction::ICmp:
779   case Instruction::Select:
780   case Instruction::Or:
781   case VPInstruction::PtrAdd:
782     // TODO: Cover additional opcodes.
783     return vputils::onlyFirstLaneUsed(this);
784   case VPInstruction::ActiveLaneMask:
785   case VPInstruction::ExplicitVectorLength:
786   case VPInstruction::CalculateTripCountMinusVF:
787   case VPInstruction::CanonicalIVIncrementForPart:
788   case VPInstruction::BranchOnCount:
789   case VPInstruction::BranchOnCond:
790   case VPInstruction::ResumePhi:
791     return true;
792   };
793   llvm_unreachable("switch should return");
794 }
795 
796 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
797   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
798   if (Instruction::isBinaryOp(getOpcode()))
799     return vputils::onlyFirstPartUsed(this);
800 
801   switch (getOpcode()) {
802   default:
803     return false;
804   case Instruction::ICmp:
805   case Instruction::Select:
806     return vputils::onlyFirstPartUsed(this);
807   case VPInstruction::BranchOnCount:
808   case VPInstruction::BranchOnCond:
809   case VPInstruction::CanonicalIVIncrementForPart:
810     return true;
811   };
812   llvm_unreachable("switch should return");
813 }
814 
815 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
816 void VPInstruction::dump() const {
817   VPSlotTracker SlotTracker(getParent()->getPlan());
818   print(dbgs(), "", SlotTracker);
819 }
820 
821 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
822                           VPSlotTracker &SlotTracker) const {
823   O << Indent << "EMIT ";
824 
825   if (hasResult()) {
826     printAsOperand(O, SlotTracker);
827     O << " = ";
828   }
829 
830   switch (getOpcode()) {
831   case VPInstruction::Not:
832     O << "not";
833     break;
834   case VPInstruction::SLPLoad:
835     O << "combined load";
836     break;
837   case VPInstruction::SLPStore:
838     O << "combined store";
839     break;
840   case VPInstruction::ActiveLaneMask:
841     O << "active lane mask";
842     break;
843   case VPInstruction::ResumePhi:
844     O << "resume-phi";
845     break;
846   case VPInstruction::ExplicitVectorLength:
847     O << "EXPLICIT-VECTOR-LENGTH";
848     break;
849   case VPInstruction::FirstOrderRecurrenceSplice:
850     O << "first-order splice";
851     break;
852   case VPInstruction::BranchOnCond:
853     O << "branch-on-cond";
854     break;
855   case VPInstruction::CalculateTripCountMinusVF:
856     O << "TC > VF ? TC - VF : 0";
857     break;
858   case VPInstruction::CanonicalIVIncrementForPart:
859     O << "VF * Part +";
860     break;
861   case VPInstruction::BranchOnCount:
862     O << "branch-on-count";
863     break;
864   case VPInstruction::ExtractFromEnd:
865     O << "extract-from-end";
866     break;
867   case VPInstruction::ComputeReductionResult:
868     O << "compute-reduction-result";
869     break;
870   case VPInstruction::LogicalAnd:
871     O << "logical-and";
872     break;
873   case VPInstruction::PtrAdd:
874     O << "ptradd";
875     break;
876   case VPInstruction::AnyOf:
877     O << "any-of";
878     break;
879   default:
880     O << Instruction::getOpcodeName(getOpcode());
881   }
882 
883   printFlags(O);
884   printOperands(O, SlotTracker);
885 
886   if (auto DL = getDebugLoc()) {
887     O << ", !dbg ";
888     DL.print(O);
889   }
890 }
891 #endif
892 
893 void VPIRInstruction::execute(VPTransformState &State) {
894   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
895          "Only PHINodes can have extra operands");
896   for (const auto &[Idx, Op] : enumerate(operands())) {
897     VPValue *ExitValue = Op;
898     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
899                     ? VPLane::getFirstLane()
900                     : VPLane::getLastLaneForVF(State.VF);
901     VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
902     auto *PredVPBB = Pred->getExitingBasicBlock();
903     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
904     // Set insertion point in PredBB in case an extract needs to be generated.
905     // TODO: Model extracts explicitly.
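    // For a value that is not uniform-after-vectorization this typically
    // materializes an extract of the last lane in the predecessor block, e.g.
    // (illustrative) %e = extractelement <4 x i32> %vec, i64 3.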
906     State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
907     Value *V = State.get(ExitValue, VPLane(Lane));
908     auto *Phi = cast<PHINode>(&I);
909     // If there is no existing block for PredBB in the phi, add a new incoming
910     // value. Otherwise update the existing incoming value for PredBB.
911     if (Phi->getBasicBlockIndex(PredBB) == -1)
912       Phi->addIncoming(V, PredBB);
913     else
914       Phi->setIncomingValueForBlock(PredBB, V);
915   }
916 
917   // Advance the insert point after the wrapped IR instruction. This allows
918   // interleaving VPIRInstructions and other recipes.
919   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
920 }
921 
922 InstructionCost VPIRInstruction::computeCost(ElementCount VF,
923                                              VPCostContext &Ctx) const {
924   // The recipe wraps an existing IR instruction on the border of VPlan's scope,
925   // hence it does not contribute to the cost-modeling for the VPlan.
926   return 0;
927 }
928 
929 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
930 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
931                             VPSlotTracker &SlotTracker) const {
932   O << Indent << "IR " << I;
933 
934   if (getNumOperands() != 0) {
935     O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
936     interleaveComma(
937         enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
938           Op.value()->printAsOperand(O, SlotTracker);
939           O << " from ";
940           getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
941         });
942     O << ")";
943   }
944 }
945 #endif
946 
947 void VPWidenCallRecipe::execute(VPTransformState &State) {
948   assert(State.VF.isVector() && "not widening");
949   State.setDebugLocFrom(getDebugLoc());
950 
951   FunctionType *VFTy = Variant->getFunctionType();
953   SmallVector<Value *, 4> Args;
954   for (const auto &I : enumerate(arg_operands())) {
955     Value *Arg;
956     // Some vectorized function variants may also take a scalar argument,
957     // e.g. linear parameters for pointers. This needs to be the scalar value
958     // from the start of the respective part when interleaving.
959     if (!VFTy->getParamType(I.index())->isVectorTy())
960       Arg = State.get(I.value(), VPLane(0));
961     else
962       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
963     Args.push_back(Arg);
964   }
965 
966   assert(Variant != nullptr && "Can't create vector function.");
967 
968   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
969   SmallVector<OperandBundleDef, 1> OpBundles;
970   if (CI)
971     CI->getOperandBundlesAsDefs(OpBundles);
972 
973   CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
974   setFlags(V);
975 
976   if (!V->getType()->isVoidTy())
977     State.set(this, V);
978   State.addMetadata(V, CI);
979 }
980 
981 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
982                                                VPCostContext &Ctx) const {
983   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
984   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
985                                   Variant->getFunctionType()->params(),
986                                   CostKind);
987 }
988 
989 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
990 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
991                               VPSlotTracker &SlotTracker) const {
992   O << Indent << "WIDEN-CALL ";
993 
994   Function *CalledFn = getCalledScalarFunction();
995   if (CalledFn->getReturnType()->isVoidTy())
996     O << "void ";
997   else {
998     printAsOperand(O, SlotTracker);
999     O << " = ";
1000   }
1001 
1002   O << "call";
1003   printFlags(O);
1004   O << " @" << CalledFn->getName() << "(";
1005   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
1006     Op->printAsOperand(O, SlotTracker);
1007   });
1008   O << ")";
1009 
1010   O << " (using library function";
1011   if (Variant->hasName())
1012     O << ": " << Variant->getName();
1013   O << ")";
1014 }
1015 #endif
1016 
1017 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1018   assert(State.VF.isVector() && "not widening");
1019   State.setDebugLocFrom(getDebugLoc());
1020 
1021   SmallVector<Type *, 2> TysForDecl;
1022   // Add return type if intrinsic is overloaded on it.
1023   if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1024     TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1025   SmallVector<Value *, 4> Args;
1026   for (const auto &I : enumerate(operands())) {
1027     // Some intrinsics have a scalar argument - don't replace it with a
1028     // vector.
1029     Value *Arg;
1030     if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1031                                            State.TTI))
1032       Arg = State.get(I.value(), VPLane(0));
1033     else
1034       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1035     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1036                                                State.TTI))
1037       TysForDecl.push_back(Arg->getType());
1038     Args.push_back(Arg);
1039   }
1040 
1041   // Use vector version of the intrinsic.
1042   Module *M = State.Builder.GetInsertBlock()->getModule();
1043   Function *VectorF =
1044       Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1045   assert(VectorF &&
1046          "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1047 
1048   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1049   SmallVector<OperandBundleDef, 1> OpBundles;
1050   if (CI)
1051     CI->getOperandBundlesAsDefs(OpBundles);
1052 
1053   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1054 
1055   setFlags(V);
1056 
1057   if (!V->getType()->isVoidTy())
1058     State.set(this, V);
1059   State.addMetadata(V, CI);
1060 }
1061 
1062 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1063                                                     VPCostContext &Ctx) const {
1064   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1065 
1066   // Some backends analyze intrinsic arguments to determine cost. Use the
1067   // underlying value for the operand if it has one. Otherwise try to use the
1068   // operand of the underlying call instruction, if there is one. Otherwise
1069   // clear Arguments.
1070   // TODO: Rework TTI interface to be independent of concrete IR values.
1071   SmallVector<const Value *> Arguments;
1072   for (const auto &[Idx, Op] : enumerate(operands())) {
1073     auto *V = Op->getUnderlyingValue();
1074     if (!V) {
1075       // Push all of the VP intrinsic's operands into Arguments, even if
1076       // they are nullptr; the cost computation for some VP intrinsics
1077       // asserts on the number of parameters. This mainly happens when:
1078       // 1. the EVL operand has no underlying value, or
1079       // 2. an argument of the VP intrinsic is itself defined by a VP intrinsic.
1080       if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
1081         Arguments.push_back(V);
1082         continue;
1083       }
1084       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
1085         Arguments.push_back(UI->getArgOperand(Idx));
1086         continue;
1087       }
1088       Arguments.clear();
1089       break;
1090     }
1091     Arguments.push_back(V);
1092   }
1093 
1094   Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1095   SmallVector<Type *> ParamTys;
1096   for (unsigned I = 0; I != getNumOperands(); ++I)
1097     ParamTys.push_back(
1098         toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
1099 
1100   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1101   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1102   IntrinsicCostAttributes CostAttrs(
1103       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1104       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
1105   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
1106 }
1107 
1108 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1109   return Intrinsic::getBaseName(VectorIntrinsicID);
1110 }
1111 
1112 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1113   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1114   // Vector predication intrinsics only demand the first lane of the last
1115   // operand (the EVL operand).
1116   return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
1117          Op == getOperand(getNumOperands() - 1);
1118 }
1119 
1120 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1121 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
1122                                    VPSlotTracker &SlotTracker) const {
1123   O << Indent << "WIDEN-INTRINSIC ";
1124   if (ResultTy->isVoidTy()) {
1125     O << "void ";
1126   } else {
1127     printAsOperand(O, SlotTracker);
1128     O << " = ";
1129   }
1130 
1131   O << "call";
1132   printFlags(O);
1133   O << getIntrinsicName() << "(";
1134 
1135   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1136     Op->printAsOperand(O, SlotTracker);
1137   });
1138   O << ")";
1139 }
1140 #endif
1141 
1142 void VPHistogramRecipe::execute(VPTransformState &State) {
1143   State.setDebugLocFrom(getDebugLoc());
1144   IRBuilderBase &Builder = State.Builder;
1145 
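  // The update is lowered to the histogram intrinsic; illustratively, for a
  // <4 x ptr> bucket-address vector, an i32 increment of 1 and an all-true
  // mask:
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //     <4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)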
1146   Value *Address = State.get(getOperand(0));
1147   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1148   VectorType *VTy = cast<VectorType>(Address->getType());
1149 
1150   // The histogram intrinsic requires a mask even if the recipe doesn't;
1151   // if the mask operand was omitted then all lanes should be executed and
1152   // we just need to synthesize an all-true mask.
1153   Value *Mask = nullptr;
1154   if (VPValue *VPMask = getMask())
1155     Mask = State.get(VPMask);
1156   else
1157     Mask =
1158         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1159 
1160   // If this is a subtract, we want to invert the increment amount. We may
1161   // add a separate intrinsic in the future, but for now we'll try this.
1162   if (Opcode == Instruction::Sub)
1163     IncAmt = Builder.CreateNeg(IncAmt);
1164   else
1165     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1166 
1167   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1168                                 {VTy, IncAmt->getType()},
1169                                 {Address, IncAmt, Mask});
1170 }
1171 
1172 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1173                                                VPCostContext &Ctx) const {
1174   // FIXME: Take the gather and scatter into account as well. For now we're
1175   //        generating the same cost as the fallback path, but we'll likely
1176   //        need to create a new TTI method for determining the cost, including
1177   //        whether we can use base + vec-of-smaller-indices or just
1178   //        vec-of-pointers.
1179   assert(VF.isVector() && "Invalid VF for histogram cost");
1180   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1181   VPValue *IncAmt = getOperand(1);
1182   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1183   VectorType *VTy = VectorType::get(IncTy, VF);
1184 
1185   // Assume that a non-constant update value (or a constant != 1) requires
1186   // a multiply, and add that into the cost.
1187   InstructionCost MulCost =
1188       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
1189   if (IncAmt->isLiveIn()) {
1190     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1191 
1192     if (CI && CI->getZExtValue() == 1)
1193       MulCost = TTI::TCC_Free;
1194   }
1195 
1196   // Find the cost of the histogram operation itself.
1197   Type *PtrTy = VectorType::get(AddressTy, VF);
1198   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1199   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1200                               Type::getVoidTy(Ctx.LLVMCtx),
1201                               {PtrTy, IncTy, MaskTy});
1202 
1203   // Add the costs together with the add/sub operation.
1204   return Ctx.TTI.getIntrinsicInstrCost(
1205              ICA, TargetTransformInfo::TCK_RecipThroughput) +
1206          MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
1207 }
1208 
1209 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1210 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1211                               VPSlotTracker &SlotTracker) const {
1212   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1213   getOperand(0)->printAsOperand(O, SlotTracker);
1214 
1215   if (Opcode == Instruction::Sub)
1216     O << ", dec: ";
1217   else {
1218     assert(Opcode == Instruction::Add);
1219     O << ", inc: ";
1220   }
1221   getOperand(1)->printAsOperand(O, SlotTracker);
1222 
1223   if (VPValue *Mask = getMask()) {
1224     O << ", mask: ";
1225     Mask->printAsOperand(O, SlotTracker);
1226   }
1227 }
1228 
1229 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1230                                 VPSlotTracker &SlotTracker) const {
1231   O << Indent << "WIDEN-SELECT ";
1232   printAsOperand(O, SlotTracker);
1233   O << " = select ";
1234   getOperand(0)->printAsOperand(O, SlotTracker);
1235   O << ", ";
1236   getOperand(1)->printAsOperand(O, SlotTracker);
1237   O << ", ";
1238   getOperand(2)->printAsOperand(O, SlotTracker);
1239   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1240 }
1241 #endif
1242 
1243 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1244   State.setDebugLocFrom(getDebugLoc());
1245 
1246   // The condition can be loop invariant but still defined inside the
1247   // loop. This means that we can't just use the original 'cond' value.
1248   // We have to take the 'vectorized' value and pick the first lane.
1249   // Instcombine will make this a no-op.
1250   auto *InvarCond =
1251       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1252 
1253   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1254   Value *Op0 = State.get(getOperand(1));
1255   Value *Op1 = State.get(getOperand(2));
1256   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1257   State.set(this, Sel);
1258   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1259 }
1260 
1261 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1262                                                  VPCostContext &Ctx) const {
1263   SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
1264   bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1265   Type *ScalarTy = Ctx.Types.inferScalarType(this);
1266   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1267   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1268 
1269   VPValue *Op0, *Op1;
1270   using namespace llvm::VPlanPatternMatch;
1271   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1272       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1273        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1274     // select x, y, false --> x & y
1275     // select x, true, y --> x | y
1276     const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1277     const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1278 
1279     SmallVector<const Value *, 2> Operands;
1280     if (all_of(operands(),
1281                [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1282       Operands.append(SI->op_begin(), SI->op_end());
1283     bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1284     return Ctx.TTI.getArithmeticInstrCost(
1285         IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, CostKind,
1286         {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1287   }
1288 
1289   Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1290   if (!ScalarCond)
1291     CondTy = VectorType::get(CondTy, VF);
1292 
1293   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1294   if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
1295     Pred = Cmp->getPredicate();
1296   return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred,
1297                                     CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1298                                     {TTI::OK_AnyValue, TTI::OP_None}, SI);
1299 }
1300 
1301 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
1302     const FastMathFlags &FMF) {
1303   AllowReassoc = FMF.allowReassoc();
1304   NoNaNs = FMF.noNaNs();
1305   NoInfs = FMF.noInfs();
1306   NoSignedZeros = FMF.noSignedZeros();
1307   AllowReciprocal = FMF.allowReciprocal();
1308   AllowContract = FMF.allowContract();
1309   ApproxFunc = FMF.approxFunc();
1310 }
1311 
1312 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1313 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1314   switch (OpType) {
1315   case OperationType::Cmp:
1316     O << " " << CmpInst::getPredicateName(getPredicate());
1317     break;
1318   case OperationType::DisjointOp:
1319     if (DisjointFlags.IsDisjoint)
1320       O << " disjoint";
1321     break;
1322   case OperationType::PossiblyExactOp:
1323     if (ExactFlags.IsExact)
1324       O << " exact";
1325     break;
1326   case OperationType::OverflowingBinOp:
1327     if (WrapFlags.HasNUW)
1328       O << " nuw";
1329     if (WrapFlags.HasNSW)
1330       O << " nsw";
1331     break;
1332   case OperationType::FPMathOp:
1333     getFastMathFlags().print(O);
1334     break;
1335   case OperationType::GEPOp:
1336     if (GEPFlags.isInBounds())
1337       O << " inbounds";
1338     else if (GEPFlags.hasNoUnsignedSignedWrap())
1339       O << " nusw";
1340     if (GEPFlags.hasNoUnsignedWrap())
1341       O << " nuw";
1342     break;
1343   case OperationType::NonNegOp:
1344     if (NonNegFlags.NonNeg)
1345       O << " nneg";
1346     break;
1347   case OperationType::Other:
1348     break;
1349   }
1350   if (getNumOperands() > 0)
1351     O << " ";
1352 }
1353 #endif
1354 
1355 void VPWidenRecipe::execute(VPTransformState &State) {
1356   State.setDebugLocFrom(getDebugLoc());
1357   auto &Builder = State.Builder;
1358   switch (Opcode) {
1359   case Instruction::Call:
1360   case Instruction::Br:
1361   case Instruction::PHI:
1362   case Instruction::GetElementPtr:
1363   case Instruction::Select:
1364     llvm_unreachable("This instruction is handled by a different recipe.");
1365   case Instruction::UDiv:
1366   case Instruction::SDiv:
1367   case Instruction::SRem:
1368   case Instruction::URem:
1369   case Instruction::Add:
1370   case Instruction::FAdd:
1371   case Instruction::Sub:
1372   case Instruction::FSub:
1373   case Instruction::FNeg:
1374   case Instruction::Mul:
1375   case Instruction::FMul:
1376   case Instruction::FDiv:
1377   case Instruction::FRem:
1378   case Instruction::Shl:
1379   case Instruction::LShr:
1380   case Instruction::AShr:
1381   case Instruction::And:
1382   case Instruction::Or:
1383   case Instruction::Xor: {
1384     // Just widen unops and binops.
1385     SmallVector<Value *, 2> Ops;
1386     for (VPValue *VPOp : operands())
1387       Ops.push_back(State.get(VPOp));
1388 
1389     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1390 
1391     if (auto *VecOp = dyn_cast<Instruction>(V))
1392       setFlags(VecOp);
1393 
1394     // Use this vector value for all users of the original instruction.
1395     State.set(this, V);
1396     State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1397     break;
1398   }
1399   case Instruction::Freeze: {
1400     Value *Op = State.get(getOperand(0));
1401 
1402     Value *Freeze = Builder.CreateFreeze(Op);
1403     State.set(this, Freeze);
1404     break;
1405   }
1406   case Instruction::ICmp:
1407   case Instruction::FCmp: {
1408     // Widen compares. Generate vector compares.
1409     bool FCmp = Opcode == Instruction::FCmp;
1410     Value *A = State.get(getOperand(0));
1411     Value *B = State.get(getOperand(1));
1412     Value *C = nullptr;
1413     if (FCmp) {
1414       // Propagate fast math flags.
1415       IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1416       if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
1417         Builder.setFastMathFlags(I->getFastMathFlags());
1418       C = Builder.CreateFCmp(getPredicate(), A, B);
1419     } else {
1420       C = Builder.CreateICmp(getPredicate(), A, B);
1421     }
1422     State.set(this, C);
1423     State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1424     break;
1425   }
1426   default:
1427     // This instruction is not vectorized by simple widening.
1428     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1429                       << Instruction::getOpcodeName(Opcode));
1430     llvm_unreachable("Unhandled instruction!");
1431   } // end of switch.
1432 
1433 #if !defined(NDEBUG)
1434   // Verify that VPlan type inference results agree with the type of the
1435   // generated values.
1436   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1437              State.get(this)->getType() &&
1438          "inferred type and type from generated instructions do not match");
1439 #endif
1440 }
1441 
1442 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1443                                            VPCostContext &Ctx) const {
1444   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1445   switch (Opcode) {
1446   case Instruction::FNeg: {
1447     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1448     return Ctx.TTI.getArithmeticInstrCost(
1449         Opcode, VectorTy, CostKind,
1450         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1451         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1452   }
1453 
1454   case Instruction::UDiv:
1455   case Instruction::SDiv:
1456   case Instruction::SRem:
1457   case Instruction::URem:
1458     // More complex computation; let the legacy cost model handle this for now.
1459     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1460   case Instruction::Add:
1461   case Instruction::FAdd:
1462   case Instruction::Sub:
1463   case Instruction::FSub:
1464   case Instruction::Mul:
1465   case Instruction::FMul:
1466   case Instruction::FDiv:
1467   case Instruction::FRem:
1468   case Instruction::Shl:
1469   case Instruction::LShr:
1470   case Instruction::AShr:
1471   case Instruction::And:
1472   case Instruction::Or:
1473   case Instruction::Xor: {
1474     VPValue *RHS = getOperand(1);
1475     // Certain instructions can be cheaper to vectorize if they have a constant
1476     // second vector operand. One example of this is shifts on x86.
1477     TargetTransformInfo::OperandValueInfo RHSInfo = {
1478         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1479     if (RHS->isLiveIn())
1480       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1481 
1482     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1483         getOperand(1)->isDefinedOutsideLoopRegions())
1484       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1485     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1486     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1487 
1488     SmallVector<const Value *, 4> Operands;
1489     if (CtxI)
1490       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1491     return Ctx.TTI.getArithmeticInstrCost(
1492         Opcode, VectorTy, CostKind,
1493         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1494         RHSInfo, Operands, CtxI, &Ctx.TLI);
1495   }
1496   case Instruction::Freeze: {
1497     // The cost of this opcode is unknown; assume it is the same as 'mul'.
1498     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1499     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
1500   }
1501   case Instruction::ICmp:
1502   case Instruction::FCmp: {
1503     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1504     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1505     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1506                                       CostKind,
1507                                       {TTI::OK_AnyValue, TTI::OP_None},
1508                                       {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1509   }
1510   default:
1511     llvm_unreachable("Unsupported opcode for instruction");
1512   }
1513 }
1514 
1515 void VPWidenEVLRecipe::execute(VPTransformState &State) {
1516   unsigned Opcode = getOpcode();
1517   // TODO: Support other opcodes
1518   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1519     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
1520 
1521   State.setDebugLocFrom(getDebugLoc());
1522 
1523   assert(State.get(getOperand(0))->getType()->isVectorTy() &&
1524          "VPWidenEVLRecipe should not be used for scalars");
1525 
1526   VPValue *EVL = getEVL();
1527   Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true);
1528   IRBuilderBase &BuilderIR = State.Builder;
1529   VectorBuilder Builder(BuilderIR);
1530   Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
1531 
1532   SmallVector<Value *, 4> Ops;
1533   for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
1534     VPValue *VPOp = getOperand(I);
1535     Ops.push_back(State.get(VPOp));
1536   }
1537 
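  // Emit the corresponding VP intrinsic (e.g. llvm.vp.fadd for FAdd), guarded
  // by an all-true mask and limited to the first EVL lanes.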
1538   Builder.setMask(Mask).setEVL(EVLArg);
1539   Value *VPInst =
1540       Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
1541   // Currently vp-intrinsics only accept fast-math flags (FMF).
1542   // TODO: Enable other flags when support is added.
1543   if (isa<FPMathOperator>(VPInst))
1544     setFlags(cast<Instruction>(VPInst));
1545 
1546   State.set(this, VPInst);
1547   State.addMetadata(VPInst,
1548                     dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1549 }
1550 
1551 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1552 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1553                           VPSlotTracker &SlotTracker) const {
1554   O << Indent << "WIDEN ";
1555   printAsOperand(O, SlotTracker);
1556   O << " = " << Instruction::getOpcodeName(Opcode);
1557   printFlags(O);
1558   printOperands(O, SlotTracker);
1559 }
1560 
1561 void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1562                              VPSlotTracker &SlotTracker) const {
1563   O << Indent << "WIDEN ";
1564   printAsOperand(O, SlotTracker);
1565   O << " = vp." << Instruction::getOpcodeName(getOpcode());
1566   printFlags(O);
1567   printOperands(O, SlotTracker);
1568 }
1569 #endif
1570 
1571 void VPWidenCastRecipe::execute(VPTransformState &State) {
1572   State.setDebugLocFrom(getDebugLoc());
1573   auto &Builder = State.Builder;
1574   // Vectorize casts.
1575   assert(State.VF.isVector() && "Not vectorizing?");
1576   Type *DestTy = VectorType::get(getResultType(), State.VF);
1577   VPValue *Op = getOperand(0);
1578   Value *A = State.get(Op);
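  // E.g. a scalar 'zext i8 to i32' becomes 'zext <VF x i8> to <VF x i32>'.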
1579   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1580   State.set(this, Cast);
1581   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1582   if (auto *CastOp = dyn_cast<Instruction>(Cast))
1583     setFlags(CastOp);
1584 }
1585 
1586 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
1587                                                VPCostContext &Ctx) const {
1588   // TODO: In some cases, VPWidenCastRecipes are created but not considered in
1589   // the legacy cost model, including truncates/extends when evaluating a
1590   // reduction in a smaller type.
1591   if (!getUnderlyingValue())
1592     return 0;
1593   // Computes the CastContextHint from a recipe that may access memory.
1594   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1595     if (VF.isScalar())
1596       return TTI::CastContextHint::Normal;
1597     if (isa<VPInterleaveRecipe>(R))
1598       return TTI::CastContextHint::Interleave;
1599     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
1600       return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1601                                              : TTI::CastContextHint::Normal;
1602     const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1603     if (WidenMemoryRecipe == nullptr)
1604       return TTI::CastContextHint::None;
1605     if (!WidenMemoryRecipe->isConsecutive())
1606       return TTI::CastContextHint::GatherScatter;
1607     if (WidenMemoryRecipe->isReverse())
1608       return TTI::CastContextHint::Reversed;
1609     if (WidenMemoryRecipe->isMasked())
1610       return TTI::CastContextHint::Masked;
1611     return TTI::CastContextHint::Normal;
1612   };
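  // E.g. a truncate whose single user is a consecutive masked store gets
  // CastContextHint::Masked, letting TTI account for folding the cast into the
  // memory operation.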
1613 
1614   VPValue *Operand = getOperand(0);
1615   TTI::CastContextHint CCH = TTI::CastContextHint::None;
1616   // For Trunc/FPTrunc, get the context from the only user.
1617   if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
1618       !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
1619     if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
1620       CCH = ComputeCCH(StoreRecipe);
1621   }
1622   // For Z/Sext, get the context from the operand.
1623   else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1624            Opcode == Instruction::FPExt) {
1625     if (Operand->isLiveIn())
1626       CCH = TTI::CastContextHint::Normal;
1627     else if (Operand->getDefiningRecipe())
1628       CCH = ComputeCCH(Operand->getDefiningRecipe());
1629   }
1630 
1631   auto *SrcTy =
1632       cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
1633   auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
1634   // Arm TTI will use the underlying instruction to determine the cost.
1635   return Ctx.TTI.getCastInstrCost(
1636       Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
1637       dyn_cast_if_present<Instruction>(getUnderlyingValue()));
1638 }
1639 
1640 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1641 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1642                               VPSlotTracker &SlotTracker) const {
1643   O << Indent << "WIDEN-CAST ";
1644   printAsOperand(O, SlotTracker);
1645   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1646   printFlags(O);
1647   printOperands(O, SlotTracker);
1648   O << " to " << *getResultType();
1649 }
1650 #endif
1651 
1652 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
1653                                                VPCostContext &Ctx) const {
1654   return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
1655 }
1656 
1657 /// This function adds
1658 /// (0 * Step, 1 * Step, 2 * Step, ...)
1659 /// to each vector element of Val.
1660 /// \p BinOp is only relevant for FP induction variables.
1661 static Value *getStepVector(Value *Val, Value *Step,
1662                             Instruction::BinaryOps BinOp, ElementCount VF,
1663                             IRBuilderBase &Builder) {
1664   assert(VF.isVector() && "only vector VFs are supported");
1665 
1666   // Create and check the types.
1667   auto *ValVTy = cast<VectorType>(Val->getType());
1668   ElementCount VLen = ValVTy->getElementCount();
1669 
1670   Type *STy = Val->getType()->getScalarType();
1671   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1672          "Induction Step must be an integer or FP");
1673   assert(Step->getType() == STy && "Step has wrong type");
1674 
1675   SmallVector<Constant *, 8> Indices;
1676 
1677   // Create a vector of consecutive numbers from zero to VF.
1678   VectorType *InitVecValVTy = ValVTy;
1679   if (STy->isFloatingPointTy()) {
1680     Type *InitVecValSTy =
1681         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1682     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1683   }
1684   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1685 
1686   if (STy->isIntegerTy()) {
1687     Step = Builder.CreateVectorSplat(VLen, Step);
1688     assert(Step->getType() == Val->getType() && "Invalid step vec");
1689     // FIXME: The newly created binary instructions should contain nsw/nuw
1690     // flags, which can be taken from the original scalar operations.
1691     Step = Builder.CreateMul(InitVec, Step);
1692     return Builder.CreateAdd(Val, Step, "induction");
1693   }
1694 
1695   // Floating point induction.
1696   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1697          "Binary Opcode should be specified for FP induction");
1698   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1699 
1700   Step = Builder.CreateVectorSplat(VLen, Step);
1701   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1702   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1703 }
1704 
1705 /// A helper function that returns an integer or floating-point constant with
1706 /// value C.
1707 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1708   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1709                            : ConstantFP::get(Ty, C);
1710 }
1711 
1712 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1713   assert(!State.Lane && "Int or FP induction being replicated.");
1714 
1715   Value *Start = getStartValue()->getLiveInIRValue();
1716   const InductionDescriptor &ID = getInductionDescriptor();
1717   TruncInst *Trunc = getTruncInst();
1718   IRBuilderBase &Builder = State.Builder;
1719   assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
1720          "Types must match");
1721   assert(State.VF.isVector() && "must have vector VF");
1722 
1723   // The value from the original loop to which we are mapping the new induction
1724   // variable.
1725   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
1726 
1727   // Fast-math-flags propagate from the original induction instruction.
1728   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1729   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1730     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1731 
1732   // Now do the actual transformations, and start with fetching the step value.
1733   Value *Step = State.get(getStepValue(), VPLane(0));
1734 
1735   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1736          "Expected either an induction phi-node or a truncate of it!");
1737 
1738   // Construct the initial value of the vector IV in the vector loop preheader
1739   auto CurrIP = Builder.saveIP();
1740   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1741   Builder.SetInsertPoint(VectorPH->getTerminator());
1742   if (isa<TruncInst>(EntryVal)) {
1743     assert(Start->getType()->isIntegerTy() &&
1744            "Truncation requires an integer type");
1745     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1746     Step = Builder.CreateTrunc(Step, TruncType);
1747     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1748   }
1749 
1750   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1751   Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
1752                                       State.VF, State.Builder);
1753 
1754   // We create vector phi nodes for both integer and floating-point induction
1755   // variables. Here, we determine the kind of arithmetic we will perform.
1756   Instruction::BinaryOps AddOp;
1757   Instruction::BinaryOps MulOp;
1758   if (Step->getType()->isIntegerTy()) {
1759     AddOp = Instruction::Add;
1760     MulOp = Instruction::Mul;
1761   } else {
1762     AddOp = ID.getInductionOpcode();
1763     MulOp = Instruction::FMul;
1764   }
1765 
1766   Value *SplatVF;
1767   if (VPValue *SplatVFOperand = getSplatVFValue()) {
1768     // The recipe has been unrolled. In that case, fetch the splat value for the
1769     // induction increment.
1770     SplatVF = State.get(SplatVFOperand);
1771   } else {
1772     // Multiply the vectorization factor by the step using integer or
1773     // floating-point arithmetic as appropriate.
1774     Type *StepType = Step->getType();
1775     Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
1776     if (Step->getType()->isFloatingPointTy())
1777       RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
1778     else
1779       RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
1780     Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1781 
1782     // Create a vector splat to use in the induction update.
1783     SplatVF = Builder.CreateVectorSplat(State.VF, Mul);
1784   }
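  // At this point SplatVF is a splat of VF * Step (or of the value supplied by
  // unrolling), i.e. the amount added to the vector IV on each iteration.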
1785 
1786   Builder.restoreIP(CurrIP);
1787 
1788   // We may need to add the step a number of times, depending on the unroll
1789   // factor. The last of those goes into the PHI.
1790   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1791   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1792   VecInd->setDebugLoc(getDebugLoc());
1793   State.set(this, VecInd);
1794 
1795   Instruction *LastInduction = cast<Instruction>(
1796       Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
1797   if (isa<TruncInst>(EntryVal))
1798     State.addMetadata(LastInduction, EntryVal);
1799   LastInduction->setDebugLoc(getDebugLoc());
1800 
1801   VecInd->addIncoming(SteppedStart, VectorPH);
1802   // Add induction update using an incorrect block temporarily. The phi node
1803   // will be fixed after VPlan execution. Note that at this point the latch
1804   // block cannot be used, as it does not exist yet.
1805   // TODO: Model increment value in VPlan, by turning the recipe into a
1806   // multi-def and a subclass of VPHeaderPHIRecipe.
1807   VecInd->addIncoming(LastInduction, VectorPH);
1808 }
1809 
1810 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1811 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1812                                           VPSlotTracker &SlotTracker) const {
1813   O << Indent;
1814   printAsOperand(O, SlotTracker);
1815   O << " = WIDEN-INDUCTION  ";
1816   printOperands(O, SlotTracker);
1817 
1818   if (auto *TI = getTruncInst())
1819     O << " (truncated to " << *TI->getType() << ")";
1820 }
1821 #endif
1822 
1823 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1824   // The step may be defined by a recipe in the preheader (e.g. if it requires
1825   // SCEV expansion), but for the canonical induction the step is required to be
1826   // 1, which is represented as a live-in.
1827   if (getStepValue()->getDefiningRecipe())
1828     return false;
1829   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1830   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1831   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1832   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1833          getScalarType() == CanIV->getScalarType();
1834 }
1835 
1836 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1837 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1838                               VPSlotTracker &SlotTracker) const {
1839   O << Indent;
1840   printAsOperand(O, SlotTracker);
1841   O << " = DERIVED-IV ";
1842   getStartValue()->printAsOperand(O, SlotTracker);
1843   O << " + ";
1844   getOperand(1)->printAsOperand(O, SlotTracker);
1845   O << " * ";
1846   getStepValue()->printAsOperand(O, SlotTracker);
1847 }
1848 #endif
1849 
1850 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1851   // Fast-math-flags propagate from the original induction instruction.
1852   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1853   if (hasFastMathFlags())
1854     State.Builder.setFastMathFlags(getFastMathFlags());
1855 
1856   // Compute scalar induction steps. BaseIV is the scalar induction variable
1857   // on which to base the steps; Step is the size of the step.
1858 
1859   Value *BaseIV = State.get(getOperand(0), VPLane(0));
1860   Value *Step = State.get(getStepValue(), VPLane(0));
1861   IRBuilderBase &Builder = State.Builder;
1862 
1863   // Ensure step has the same type as that of scalar IV.
1864   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1865   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1866 
1867   // We build scalar steps for both integer and floating-point induction
1868   // variables. Here, we determine the kind of arithmetic we will perform.
1869   Instruction::BinaryOps AddOp;
1870   Instruction::BinaryOps MulOp;
1871   if (BaseIVTy->isIntegerTy()) {
1872     AddOp = Instruction::Add;
1873     MulOp = Instruction::Mul;
1874   } else {
1875     AddOp = InductionOpcode;
1876     MulOp = Instruction::FMul;
1877   }
1878 
1879   // Determine the number of scalars we need to generate for each unroll
1880   // iteration.
1881   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1882   // Compute the scalar steps and save the results in State.
1883   Type *IntStepTy =
1884       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1885   Type *VecIVTy = nullptr;
1886   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1887   if (!FirstLaneOnly && State.VF.isScalable()) {
1888     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1889     UnitStepVec =
1890         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1891     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1892     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1893   }
1894 
1895   unsigned StartLane = 0;
1896   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1897   if (State.Lane) {
1898     StartLane = State.Lane->getKnownLane();
1899     EndLane = StartLane + 1;
1900   }
1901   Value *StartIdx0 =
1902       createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
1903 
1904   if (!FirstLaneOnly && State.VF.isScalable()) {
1905     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1906     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1907     if (BaseIVTy->isFloatingPointTy())
1908       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1909     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1910     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1911     State.set(this, Add);
1912     // It's also useful to record the per-lane values for the known minimum
1913     // number of elements, so we do that below. This improves the code quality
1914     // when extracting the first element, for example.
1915   }
1916 
1917   if (BaseIVTy->isFloatingPointTy())
1918     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1919 
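  // Generate the per-lane scalar steps: lane L of unroll part P receives
  // BaseIV + (P * VF + L) * Step, using FP operations for FP inductions.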
1920   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1921     Value *StartIdx = Builder.CreateBinOp(
1922         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1923     // The step returned by `createStepForVF` is a runtime-evaluated value
1924     // when VF is scalable. Otherwise, it should be folded into a Constant.
1925     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1926            "Expected StartIdx to be folded to a constant when VF is not "
1927            "scalable");
1928     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1929     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1930     State.set(this, Add, VPLane(Lane));
1931   }
1932 }
1933 
1934 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1935 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1936                                   VPSlotTracker &SlotTracker) const {
1937   O << Indent;
1938   printAsOperand(O, SlotTracker);
1939   O << " = SCALAR-STEPS ";
1940   printOperands(O, SlotTracker);
1941 }
1942 #endif
1943 
1944 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1945   assert(State.VF.isVector() && "not widening");
1946   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1947   // Construct a vector GEP by widening the operands of the scalar GEP as
1948   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1949   // results in a vector of pointers when at least one operand of the GEP
1950   // is vector-typed. Thus, to keep the representation compact, we only use
1951   // vector-typed operands for loop-varying values.
1952 
1953   if (areAllOperandsInvariant()) {
1954     // If we are vectorizing, but the GEP has only loop-invariant operands,
1955     // the GEP we build (by only using vector-typed operands for
1956     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1957     // produce a vector of pointers, we need to either arbitrarily pick an
1958     // operand to broadcast, or broadcast a clone of the original GEP.
1959     // Here, we broadcast a clone of the original.
1960     //
1961     // TODO: If at some point we decide to scalarize instructions having
1962     //       loop-invariant operands, this special case will no longer be
1963     //       required. We would add the scalarization decision to
1964     //       collectLoopScalars() and teach getVectorValue() to broadcast
1965     //       the lane-zero scalar value.
1966     SmallVector<Value *> Ops;
1967     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1968       Ops.push_back(State.get(getOperand(I), VPLane(0)));
1969 
1970     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1971                                            ArrayRef(Ops).drop_front(), "",
1972                                            getGEPNoWrapFlags());
1973     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1974     State.set(this, Splat);
1975     State.addMetadata(Splat, GEP);
1976   } else {
1977     // If the GEP has at least one loop-varying operand, we are sure to
1978     // produce a vector of pointers unless VF is scalar.
1979     // The pointer operand of the new GEP. If it's loop-invariant, we
1980     // won't broadcast it.
1981     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
1982                                          : State.get(getOperand(0));
1983 
1984     // Collect all the indices for the new GEP. If any index is
1985     // loop-invariant, we won't broadcast it.
1986     SmallVector<Value *, 4> Indices;
1987     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1988       VPValue *Operand = getOperand(I);
1989       if (isIndexLoopInvariant(I - 1))
1990         Indices.push_back(State.get(Operand, VPLane(0)));
1991       else
1992         Indices.push_back(State.get(Operand));
1993     }
1994 
1995     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1996     // but it should be a vector, otherwise.
1997     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1998                                            Indices, "", getGEPNoWrapFlags());
1999     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2000            "NewGEP is not a pointer vector");
2001     State.set(this, NewGEP);
2002     State.addMetadata(NewGEP, GEP);
2003   }
2004 }
2005 
2006 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2007 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
2008                              VPSlotTracker &SlotTracker) const {
2009   O << Indent << "WIDEN-GEP ";
2010   O << (isPointerLoopInvariant() ? "Inv" : "Var");
2011   for (size_t I = 0; I < getNumOperands() - 1; ++I)
2012     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2013 
2014   O << " ";
2015   printAsOperand(O, SlotTracker);
2016   O << " = getelementptr";
2017   printFlags(O);
2018   printOperands(O, SlotTracker);
2019 }
2020 #endif
2021 
2022 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
2023                            unsigned CurrentPart, IRBuilderBase &Builder) {
2024   // Use i32 for the gep index type when the value is constant,
2025   // or query DataLayout for a more suitable index type otherwise.
2026   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2027   return IsScalable && (IsReverse || CurrentPart > 0)
2028              ? DL.getIndexType(Builder.getPtrTy(0))
2029              : Builder.getInt32Ty();
2030 }
2031 
2032 void VPReverseVectorPointerRecipe::execute(VPTransformState &State) {
2033   auto &Builder = State.Builder;
2034   State.setDebugLocFrom(getDebugLoc());
2035   unsigned CurrentPart = getUnrollPart(*this);
2036   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2037                                 CurrentPart, Builder);
2038 
2039   // The wide store needs to start at the last vector element.
2040   Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2041   if (IndexTy != RunTimeVF->getType())
2042     RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2043   // NumElt = -CurrentPart * RunTimeVF
2044   Value *NumElt = Builder.CreateMul(
2045       ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
2046   // LastLane = 1 - RunTimeVF
2047   Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
2048   Value *Ptr = State.get(getOperand(0), VPLane(0));
2049   Value *ResultPtr =
2050       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2051   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2052                                 getGEPNoWrapFlags());
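  // ResultPtr is Ptr offset by (-CurrentPart * VF + 1 - VF) elements, so a
  // forward wide load/store starting there covers this part's reversed range.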
2053 
2054   State.set(this, ResultPtr, /*IsScalar*/ true);
2055 }
2056 
2057 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2058 void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2059                                          VPSlotTracker &SlotTracker) const {
2060   O << Indent;
2061   printAsOperand(O, SlotTracker);
2062   O << " = reverse-vector-pointer";
2063   printFlags(O);
2064   O << " ";
2065   printOperands(O, SlotTracker);
2066 }
2067 #endif
2068 
2069 void VPVectorPointerRecipe::execute(VPTransformState &State) {
2070   auto &Builder = State.Builder;
2071   State.setDebugLocFrom(getDebugLoc());
2072   unsigned CurrentPart = getUnrollPart(*this);
2073   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2074                                 CurrentPart, Builder);
2075   Value *Ptr = State.get(getOperand(0), VPLane(0));
2076 
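  // Advance the pointer by CurrentPart * VF elements so that each unrolled
  // part addresses its own slice of the wide access.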
2077   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2078   Value *ResultPtr =
2079       Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
2080 
2081   State.set(this, ResultPtr, /*IsScalar*/ true);
2082 }
2083 
2084 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2085 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2086                                   VPSlotTracker &SlotTracker) const {
2087   O << Indent;
2088   printAsOperand(O, SlotTracker);
2089   O << " = vector-pointer ";
2090 
2091   printOperands(O, SlotTracker);
2092 }
2093 #endif
2094 
2095 void VPBlendRecipe::execute(VPTransformState &State) {
2096   assert(isNormalized() && "Expected blend to be normalized!");
2097   State.setDebugLocFrom(getDebugLoc());
2098   // We know that all PHIs in non-header blocks are converted into
2099   // selects, so we don't have to worry about the insertion order and we
2100   // can just use the builder.
2101   // At this point we generate the predication tree. There may be
2102   // duplications since this is a simple recursive scan, but future
2103   // optimizations will clean it up.
2104 
2105   unsigned NumIncoming = getNumIncomingValues();
2106 
2107   // Generate a sequence of selects of the form:
2108   // SELECT(Mask3, In3,
2109   //        SELECT(Mask2, In2,
2110   //               SELECT(Mask1, In1,
2111   //                      In0)))
2112   // Note that Mask0 is never used: lanes for which no path reaches this phi
2113   // (and which are therefore essentially undef) take their values from In0.
2114   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
2115   Value *Result = nullptr;
2116   for (unsigned In = 0; In < NumIncoming; ++In) {
2117     // We might have single edge PHIs (blocks) - use an identity
2118     // 'select' for the first PHI operand.
2119     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
2120     if (In == 0)
2121       Result = In0; // Initialize with the first incoming value.
2122     else {
2123       // Select between the current value and the previous incoming edge
2124       // based on the incoming mask.
2125       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
2126       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
2127     }
2128   }
2129   State.set(this, Result, OnlyFirstLaneUsed);
2130 }
2131 
2132 InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2133                                            VPCostContext &Ctx) const {
2134   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2135 
2136   // Handle cases where only the first lane is used the same way as the legacy
2137   // cost model.
2138   if (vputils::onlyFirstLaneUsed(this))
2139     return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
2140 
2141   Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2142   Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2143   return (getNumIncomingValues() - 1) *
2144          Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2145                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
2146 }
2147 
2148 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2149 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
2150                           VPSlotTracker &SlotTracker) const {
2151   O << Indent << "BLEND ";
2152   printAsOperand(O, SlotTracker);
2153   O << " =";
2154   if (getNumIncomingValues() == 1) {
2155     // Not a User of any mask: not really blending, this is a
2156     // single-predecessor phi.
2157     O << " ";
2158     getIncomingValue(0)->printAsOperand(O, SlotTracker);
2159   } else {
2160     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2161       O << " ";
2162       getIncomingValue(I)->printAsOperand(O, SlotTracker);
2163       if (I == 0)
2164         continue;
2165       O << "/";
2166       getMask(I)->printAsOperand(O, SlotTracker);
2167     }
2168   }
2169 }
2170 #endif
2171 
2172 void VPReductionRecipe::execute(VPTransformState &State) {
2173   assert(!State.Lane && "Reduction being replicated.");
2174   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2175   RecurKind Kind = RdxDesc.getRecurrenceKind();
2176   // Propagate the fast-math flags carried by the underlying instruction.
2177   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2178   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2179   Value *NewVecOp = State.get(getVecOp());
2180   if (VPValue *Cond = getCondOp()) {
2181     Value *NewCond = State.get(Cond, State.VF.isScalar());
2182     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2183     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2184 
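    // Blend the vector operand with a neutral value: keep lanes where the
    // condition is true and substitute the reduction identity (or the start
    // value for any-of reductions) elsewhere, so inactive lanes do not affect
    // the reduced result.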
2185     Value *Start;
2186     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
2187       Start = RdxDesc.getRecurrenceStartValue();
2188     else
2189       Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
2190                                           RdxDesc.getFastMathFlags());
2191     if (State.VF.isVector())
2192       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2193 
2194     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2195     NewVecOp = Select;
2196   }
2197   Value *NewRed;
2198   Value *NextInChain;
2199   if (IsOrdered) {
2200     if (State.VF.isVector())
2201       NewRed =
2202           createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
2203     else
2204       NewRed = State.Builder.CreateBinOp(
2205           (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp);
2206     PrevInChain = NewRed;
2207     NextInChain = NewRed;
2208   } else {
2209     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2210     NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
2211     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2212       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
2213                                    NewRed, PrevInChain);
2214     else
2215       NextInChain = State.Builder.CreateBinOp(
2216           (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain);
2217   }
2218   State.set(this, NextInChain, /*IsScalar*/ true);
2219 }
2220 
2221 void VPReductionEVLRecipe::execute(VPTransformState &State) {
2222   assert(!State.Lane && "Reduction being replicated.");
2223 
2224   auto &Builder = State.Builder;
2225   // Propagate the fast-math flags carried by the underlying instruction.
2226   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2227   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2228   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
2229 
2230   RecurKind Kind = RdxDesc.getRecurrenceKind();
2231   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2232   Value *VecOp = State.get(getVecOp());
2233   Value *EVL = State.get(getEVL(), VPLane(0));
2234 
2235   VectorBuilder VBuilder(Builder);
2236   VBuilder.setEVL(EVL);
2237   Value *Mask;
2238   // TODO: move the all-true mask generation into VectorBuilder.
2239   if (VPValue *CondOp = getCondOp())
2240     Mask = State.get(CondOp);
2241   else
2242     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2243   VBuilder.setMask(Mask);
2244 
2245   Value *NewRed;
2246   if (isOrdered()) {
2247     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
2248   } else {
2249     NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
2250     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2251       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2252     else
2253       NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(),
2254                                    NewRed, Prev);
2255   }
2256   State.set(this, NewRed, /*IsScalar*/ true);
2257 }
2258 
2259 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2260                                                VPCostContext &Ctx) const {
2261   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
2262   Type *ElementTy = Ctx.Types.inferScalarType(this);
2263   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2264   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2265   unsigned Opcode = RdxDesc.getOpcode();
2266 
2267   // TODO: Support any-of and in-loop reductions.
2268   assert(
2269       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2270        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2271       "Any-of reduction not implemented in VPlan-based cost model currently.");
2272   assert(
2273       (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
2274        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2275       "In-loop reduction not implemented in VPlan-based cost model currently.");
2276 
2277   assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() &&
2278          "Inferred type and recurrence type mismatch.");
2279 
2280   // Cost = Reduction cost + BinOp cost
2281   InstructionCost Cost =
2282       Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
2283   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
2284     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
2285     return Cost + Ctx.TTI.getMinMaxReductionCost(
2286                       Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
2287   }
2288 
2289   return Cost + Ctx.TTI.getArithmeticReductionCost(
2290                     Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
2291 }
2292 
2293 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2294 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2295                               VPSlotTracker &SlotTracker) const {
2296   O << Indent << "REDUCE ";
2297   printAsOperand(O, SlotTracker);
2298   O << " = ";
2299   getChainOp()->printAsOperand(O, SlotTracker);
2300   O << " +";
2301   if (isa<FPMathOperator>(getUnderlyingInstr()))
2302     O << getUnderlyingInstr()->getFastMathFlags();
2303   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2304   getVecOp()->printAsOperand(O, SlotTracker);
2305   if (isConditional()) {
2306     O << ", ";
2307     getCondOp()->printAsOperand(O, SlotTracker);
2308   }
2309   O << ")";
2310   if (RdxDesc.IntermediateStore)
2311     O << " (with final reduction value stored in invariant address sank "
2312          "outside of loop)";
2313 }
2314 
2315 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2316                                  VPSlotTracker &SlotTracker) const {
2317   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
2318   O << Indent << "REDUCE ";
2319   printAsOperand(O, SlotTracker);
2320   O << " = ";
2321   getChainOp()->printAsOperand(O, SlotTracker);
2322   O << " +";
2323   if (isa<FPMathOperator>(getUnderlyingInstr()))
2324     O << getUnderlyingInstr()->getFastMathFlags();
2325   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
2326   getVecOp()->printAsOperand(O, SlotTracker);
2327   O << ", ";
2328   getEVL()->printAsOperand(O, SlotTracker);
2329   if (isConditional()) {
2330     O << ", ";
2331     getCondOp()->printAsOperand(O, SlotTracker);
2332   }
2333   O << ")";
2334   if (RdxDesc.IntermediateStore)
2335     O << " (with final reduction value stored in invariant address sank "
2336          "outside of loop)";
2337 }
2338 #endif
2339 
2340 bool VPReplicateRecipe::shouldPack() const {
2341   // Find if the recipe is used by a widened recipe via an intervening
2342   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2343   return any_of(users(), [](const VPUser *U) {
2344     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2345       return any_of(PredR->users(), [PredR](const VPUser *U) {
2346         return !U->usesScalars(PredR);
2347       });
2348     return false;
2349   });
2350 }
2351 
2352 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
2353                                                VPCostContext &Ctx) const {
2354   Instruction *UI = cast<Instruction>(getUnderlyingValue());
2355   // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
2356   // transform; avoid computing its cost multiple times for now.
2357   Ctx.SkipCostComputation.insert(UI);
2358   return Ctx.getLegacyCost(UI, VF);
2359 }
2360 
2361 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2362 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2363                               VPSlotTracker &SlotTracker) const {
2364   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
2365 
2366   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2367     printAsOperand(O, SlotTracker);
2368     O << " = ";
2369   }
2370   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2371     O << "call";
2372     printFlags(O);
2373     O << "@" << CB->getCalledFunction()->getName() << "(";
2374     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2375                     O, [&O, &SlotTracker](VPValue *Op) {
2376                       Op->printAsOperand(O, SlotTracker);
2377                     });
2378     O << ")";
2379   } else {
2380     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2381     printFlags(O);
2382     printOperands(O, SlotTracker);
2383   }
2384 
2385   if (shouldPack())
2386     O << " (S->V)";
2387 }
2388 #endif
2389 
2390 Value *VPScalarCastRecipe::generate(VPTransformState &State) {
2391   assert(vputils::onlyFirstLaneUsed(this) &&
2392          "Codegen only implemented for first lane.");
2393   switch (Opcode) {
2394   case Instruction::SExt:
2395   case Instruction::ZExt:
2396   case Instruction::Trunc: {
2397     // Note: SExt/ZExt not used yet.
2398     Value *Op = State.get(getOperand(0), VPLane(0));
2399     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
2400   }
2401   default:
2402     llvm_unreachable("opcode not implemented yet");
2403   }
2404 }
2405 
2406 void VPScalarCastRecipe::execute(VPTransformState &State) {
2407   State.set(this, generate(State), VPLane(0));
2408 }
2409 
2410 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2411 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
2412                                 VPSlotTracker &SlotTracker) const {
2413   O << Indent << "SCALAR-CAST ";
2414   printAsOperand(O, SlotTracker);
2415   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
2416   printOperands(O, SlotTracker);
2417   O << " to " << *ResultTy;
2418 }
2419 #endif
2420 
2421 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2422   assert(State.Lane && "Branch on Mask works only on single instance.");
2423 
2424   unsigned Lane = State.Lane->getKnownLane();
2425 
2426   Value *ConditionBit = nullptr;
2427   VPValue *BlockInMask = getMask();
2428   if (BlockInMask) {
2429     ConditionBit = State.get(BlockInMask);
2430     if (ConditionBit->getType()->isVectorTy())
2431       ConditionBit = State.Builder.CreateExtractElement(
2432           ConditionBit, State.Builder.getInt32(Lane));
2433   } else // Block in mask is all-one.
2434     ConditionBit = State.Builder.getTrue();
2435 
2436   // Replace the temporary unreachable terminator with a new conditional branch,
2437   // whose two destinations will be set later when they are created.
2438   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2439   assert(isa<UnreachableInst>(CurrentTerminator) &&
2440          "Expected to replace unreachable terminator with conditional branch.");
2441   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
2442   CondBr->setSuccessor(0, nullptr);
2443   ReplaceInstWithInst(CurrentTerminator, CondBr);
2444 }
2445 
2446 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
2447                                                   VPCostContext &Ctx) const {
2448   // The legacy cost model doesn't assign costs to branches for individual
2449   // replicate regions. Match the current behavior in the VPlan cost model for
2450   // now.
2451   return 0;
2452 }
2453 
2454 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
2455   State.setDebugLocFrom(getDebugLoc());
2456   assert(State.Lane && "Predicated instruction PHI works per instance.");
2457   Instruction *ScalarPredInst =
2458       cast<Instruction>(State.get(getOperand(0), *State.Lane));
2459   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
2460   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
2461   assert(PredicatingBB && "Predicated block has no single predecessor.");
2462   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
2463          "operand must be VPReplicateRecipe");
2464 
2465   // By current pack/unpack logic we need to generate only a single phi node: if
2466   // a vector value for the predicated instruction exists at this point it means
2467   // the instruction has vector users only, and a phi for the vector value is
2468   // needed. In this case the recipe of the predicated instruction is marked to
2469   // also do that packing, thereby "hoisting" the insert-element sequence.
2470   // Otherwise, a phi node for the scalar value is needed.
2471   if (State.hasVectorValue(getOperand(0))) {
2472     Value *VectorValue = State.get(getOperand(0));
2473     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
2474     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
2475     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
2476     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
2477     if (State.hasVectorValue(this))
2478       State.reset(this, VPhi);
2479     else
2480       State.set(this, VPhi);
2481     // NOTE: Currently we need to update the value of the operand, so the next
2482     // predicated iteration inserts its generated value in the correct vector.
2483     State.reset(getOperand(0), VPhi);
2484   } else {
2485     if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
2486       return;
2487 
2488     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
2489     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
2490     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
2491                      PredicatingBB);
2492     Phi->addIncoming(ScalarPredInst, PredicatedBB);
2493     if (State.hasScalarValue(this, *State.Lane))
2494       State.reset(this, Phi, *State.Lane);
2495     else
2496       State.set(this, Phi, *State.Lane);
2497     // NOTE: Currently we need to update the value of the operand, so the next
2498     // predicated iteration inserts its generated value in the correct vector.
2499     State.reset(getOperand(0), Phi, *State.Lane);
2500   }
2501 }
2502 
2503 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2504 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2505                                 VPSlotTracker &SlotTracker) const {
2506   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2507   printAsOperand(O, SlotTracker);
2508   O << " = ";
2509   printOperands(O, SlotTracker);
2510 }
2511 #endif
2512 
2513 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
2514                                                  VPCostContext &Ctx) const {
2515   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2516   const Align Alignment =
2517       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2518   unsigned AS =
2519       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2520   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2521 
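  // Non-consecutive accesses are costed as gathers/scatters (plus address
  // computation); consecutive accesses as (masked) memory ops, with an extra
  // reverse shuffle when the access is reversed.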
2522   if (!Consecutive) {
2523     // TODO: Using the original IR may not be accurate.
2524     // Currently, ARM will use the underlying IR to calculate gather/scatter
2525     // instruction cost.
2526     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
2527     assert(!Reverse &&
2528            "Inconsecutive memory access should not have the order.");
2529     return Ctx.TTI.getAddressComputationCost(Ty) +
2530            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
2531                                           IsMasked, Alignment, CostKind,
2532                                           &Ingredient);
2533   }
2534 
2535   InstructionCost Cost = 0;
2536   if (IsMasked) {
2537     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
2538                                           AS, CostKind);
2539   } else {
2540     TTI::OperandValueInfo OpInfo =
2541         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
2542     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
2543                                     CostKind, OpInfo, &Ingredient);
2544   }
2545   if (!Reverse)
2546     return Cost;
2547 
2548   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2549                                        cast<VectorType>(Ty), {}, CostKind, 0);
2550 }
2551 
2552 void VPWidenLoadRecipe::execute(VPTransformState &State) {
2553   auto *LI = cast<LoadInst>(&Ingredient);
2554 
2555   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2556   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2557   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2558   bool CreateGather = !isConsecutive();
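  // Non-consecutive accesses are widened into gathers; consecutive accesses
  // become (possibly masked) wide loads.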
2559 
2560   auto &Builder = State.Builder;
2561   State.setDebugLocFrom(getDebugLoc());
2562   Value *Mask = nullptr;
2563   if (auto *VPMask = getMask()) {
2564     // Mask reversal is only needed for non-all-one (non-null) masks, as the
2565     // reverse of a null (all-one) mask is still a null mask.
2566     Mask = State.get(VPMask);
2567     if (isReverse())
2568       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2569   }
2570 
2571   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
2572   Value *NewLI;
2573   if (CreateGather) {
2574     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
2575                                        "wide.masked.gather");
2576   } else if (Mask) {
2577     NewLI =
2578         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
2579                                  PoisonValue::get(DataTy), "wide.masked.load");
2580   } else {
2581     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
2582   }
2583   // Add metadata to the load, but set the value to the reverse shuffle.
2584   State.addMetadata(NewLI, LI);
2585   if (Reverse)
2586     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
2587   State.set(this, NewLI);
2588 }
2589 
2590 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2591 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2592                               VPSlotTracker &SlotTracker) const {
2593   O << Indent << "WIDEN ";
2594   printAsOperand(O, SlotTracker);
2595   O << " = load ";
2596   printOperands(O, SlotTracker);
2597 }
2598 #endif
2599 
2600 /// Use an all-true mask for the reverse rather than the actual mask, as it
2601 /// avoids a dependence w/o affecting the result.
2602 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
2603                                      Value *EVL, const Twine &Name) {
2604   VectorType *ValTy = cast<VectorType>(Operand->getType());
2605   Value *AllTrueMask =
2606       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
2607   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
2608                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
2609 }
2610 
2611 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
2612   auto *LI = cast<LoadInst>(&Ingredient);
2613 
2614   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
2615   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
2616   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2617   bool CreateGather = !isConsecutive();
2618 
2619   auto &Builder = State.Builder;
2620   State.setDebugLocFrom(getDebugLoc());
2621   CallInst *NewLI;
2622   Value *EVL = State.get(getEVL(), VPLane(0));
2623   Value *Addr = State.get(getAddr(), !CreateGather);
2624   Value *Mask = nullptr;
2625   if (VPValue *VPMask = getMask()) {
2626     Mask = State.get(VPMask);
2627     if (isReverse())
2628       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2629   } else {
2630     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2631   }
2632 
2633   if (CreateGather) {
2634     NewLI =
2635         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
2636                                 nullptr, "wide.masked.gather");
2637   } else {
2638     VectorBuilder VBuilder(Builder);
2639     VBuilder.setEVL(EVL).setMask(Mask);
2640     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
2641         Instruction::Load, DataTy, Addr, "vp.op.load"));
2642   }
2643   NewLI->addParamAttr(
2644       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
2645   State.addMetadata(NewLI, LI);
2646   Instruction *Res = NewLI;
2647   if (isReverse())
2648     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
2649   State.set(this, Res);
2650 }
2651 
2652 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
2653                                                   VPCostContext &Ctx) const {
2654   if (!Consecutive || IsMasked)
2655     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2656 
2657   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
2658   // because the EVL recipes use EVL to replace the tail mask, whereas the
2659   // legacy model always accounts for the cost of the mask.
2660   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
2661   // longer need to compare against the legacy cost model.
2662   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2663   const Align Alignment =
2664       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2665   unsigned AS =
2666       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2667   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2668   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2669       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2670   if (!Reverse)
2671     return Cost;
2672 
2673   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2674                                        cast<VectorType>(Ty), {}, CostKind, 0);
2675 }
2676 
2677 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2678 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2679                                  VPSlotTracker &SlotTracker) const {
2680   O << Indent << "WIDEN ";
2681   printAsOperand(O, SlotTracker);
2682   O << " = vp.load ";
2683   printOperands(O, SlotTracker);
2684 }
2685 #endif
2686 
2687 void VPWidenStoreRecipe::execute(VPTransformState &State) {
2688   auto *SI = cast<StoreInst>(&Ingredient);
2689 
2690   VPValue *StoredVPValue = getStoredValue();
2691   bool CreateScatter = !isConsecutive();
2692   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2693 
2694   auto &Builder = State.Builder;
2695   State.setDebugLocFrom(getDebugLoc());
2696 
2697   Value *Mask = nullptr;
2698   if (auto *VPMask = getMask()) {
2699     // Mask reversal is only needed for actual (non-null) masks; a null mask
2700     // stands for an all-ones mask, and the reverse of all-ones is all-ones.
2701     Mask = State.get(VPMask);
2702     if (isReverse())
2703       Mask = Builder.CreateVectorReverse(Mask, "reverse");
2704   }
2705 
2706   Value *StoredVal = State.get(StoredVPValue);
2707   if (isReverse()) {
2708     // If we store to reverse consecutive memory locations, then we need
2709     // to reverse the order of elements in the stored value.
2710     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
2711     // We don't want to update the value in the map as it might be used in
2712     // another expression. So don't call resetVectorValue(StoredVal).
2713   }
2714   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
2715   Instruction *NewSI = nullptr;
2716   if (CreateScatter)
2717     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
2718   else if (Mask)
2719     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
2720   else
2721     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
2722   State.addMetadata(NewSI, SI);
2723 }
2724 
2725 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2726 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2727                                VPSlotTracker &SlotTracker) const {
2728   O << Indent << "WIDEN store ";
2729   printOperands(O, SlotTracker);
2730 }
2731 #endif
2732 
2733 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
2734   auto *SI = cast<StoreInst>(&Ingredient);
2735 
2736   VPValue *StoredValue = getStoredValue();
2737   bool CreateScatter = !isConsecutive();
2738   const Align Alignment = getLoadStoreAlignment(&Ingredient);
2739 
2740   auto &Builder = State.Builder;
2741   State.setDebugLocFrom(getDebugLoc());
2742 
2743   CallInst *NewSI = nullptr;
2744   Value *StoredVal = State.get(StoredValue);
2745   Value *EVL = State.get(getEVL(), VPLane(0));
2746   if (isReverse())
2747     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
2748   Value *Mask = nullptr;
2749   if (VPValue *VPMask = getMask()) {
2750     Mask = State.get(VPMask);
2751     if (isReverse())
2752       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
2753   } else {
2754     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2755   }
2756   Value *Addr = State.get(getAddr(), !CreateScatter);
2757   if (CreateScatter) {
2758     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
2759                                     Intrinsic::vp_scatter,
2760                                     {StoredVal, Addr, Mask, EVL});
2761   } else {
2762     VectorBuilder VBuilder(Builder);
2763     VBuilder.setEVL(EVL).setMask(Mask);
2764     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
2765         Instruction::Store, Type::getVoidTy(EVL->getContext()),
2766         {StoredVal, Addr}));
2767   }
2768   NewSI->addParamAttr(
2769       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
2770   State.addMetadata(NewSI, SI);
2771 }
2772 
2773 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
2774                                                    VPCostContext &Ctx) const {
2775   if (!Consecutive || IsMasked)
2776     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
2777 
2778   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
2779   // here because the EVL recipes use EVL to replace the tail mask, whereas
2780   // the legacy model always accounts for the cost of the mask.
2781   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
2782   // no longer need to compare against the legacy cost model.
2783   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
2784   const Align Alignment =
2785       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
2786   unsigned AS =
2787       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
2788   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2789   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
2790       Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
2791   if (!Reverse)
2792     return Cost;
2793 
2794   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
2795                                        cast<VectorType>(Ty), {}, CostKind, 0);
2796 }
2797 
2798 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2799 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2800                                   VPSlotTracker &SlotTracker) const {
2801   O << Indent << "WIDEN vp.store ";
2802   printOperands(O, SlotTracker);
2803 }
2804 #endif
2805 
2806 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2807                                      VectorType *DstVTy, const DataLayout &DL) {
2808   // Verify that V is a vector with the same number of elements as DstVTy.
2809   auto VF = DstVTy->getElementCount();
2810   auto *SrcVecTy = cast<VectorType>(V->getType());
2811   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2812   Type *SrcElemTy = SrcVecTy->getElementType();
2813   Type *DstElemTy = DstVTy->getElementType();
2814   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2815          "Vector elements must have same size");
2816 
2817   // Do a direct cast if element types are castable.
2818   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2819     return Builder.CreateBitOrPointerCast(V, DstVTy);
2820   }
2821   // V cannot be directly cast to the desired vector type. This may happen
2822   // when V is a floating-point vector while DstVTy is a vector of pointers,
2823   // or vice versa. Handle this with a two-step cast through an intermediate
2824   // integer type, i.e. Ptr <-> Int <-> Float.
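  // For example (assuming a target with 64-bit pointers), casting <4 x double>
  // to <4 x ptr> goes <4 x double> -> <4 x i64> -> <4 x ptr>.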
2825   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2826          "Only one type should be a pointer type");
2827   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2828          "Only one type should be a floating point type");
2829   Type *IntTy =
2830       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2831   auto *VecIntTy = VectorType::get(IntTy, VF);
2832   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2833   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2834 }
2835 
2836 /// Return a vector containing interleaved elements from multiple
2837 /// smaller input vectors.
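/// For example (fixed-width illustration, factor 2): interleaving
/// <a0, a1, a2, a3> with <b0, b1, b2, b3> yields
/// <a0, b0, a1, b1, a2, b2, a3, b3>.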
2838 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2839                                 const Twine &Name) {
2840   unsigned Factor = Vals.size();
2841   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2842 
2843   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2844 #ifndef NDEBUG
2845   for (Value *Val : Vals)
2846     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2847 #endif
2848 
2849   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2850   // must use intrinsics to interleave.
2851   if (VecTy->isScalableTy()) {
2852     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2853     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2854                                    Vals,
2855                                    /*FMFSource=*/nullptr, Name);
2856   }
2857 
2858   // Fixed length. Start by concatenating all vectors into a wide vector.
2859   Value *WideVec = concatenateVectors(Builder, Vals);
2860 
2861   // Interleave the elements into the wide vector.
2862   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2863   return Builder.CreateShuffleVector(
2864       WideVec, createInterleaveMask(NumElts, Factor), Name);
2865 }
2866 
2867 // Try to vectorize the interleave group that the insert position belongs to.
2868 //
2869 // E.g. Translate following interleaved load group (factor = 3):
2870 //   for (i = 0; i < N; i+=3) {
2871 //     R = Pic[i];             // Member of index 0
2872 //     G = Pic[i+1];           // Member of index 1
2873 //     B = Pic[i+2];           // Member of index 2
2874 //     ... // do something to R, G, B
2875 //   }
2876 // To:
2877 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2878 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2879 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2880 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2881 //
2882 // Or translate following interleaved store group (factor = 3):
2883 //   for (i = 0; i < N; i+=3) {
2884 //     ... do something to R, G, B
2885 //     Pic[i]   = R;           // Member of index 0
2886 //     Pic[i+1] = G;           // Member of index 1
2887 //     Pic[i+2] = B;           // Member of index 2
2888 //   }
2889 // To:
2890 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2891 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2892 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2893 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2894 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2895 void VPInterleaveRecipe::execute(VPTransformState &State) {
2896   assert(!State.Lane && "Interleave group being replicated.");
2897   const InterleaveGroup<Instruction> *Group = IG;
2898   Instruction *Instr = Group->getInsertPos();
2899 
2900   // Prepare for the vector type of the interleaved load/store.
2901   Type *ScalarTy = getLoadStoreType(Instr);
2902   unsigned InterleaveFactor = Group->getFactor();
2903   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2904 
2905   // TODO: extend the masked interleaved-group support to reversed access.
2906   VPValue *BlockInMask = getMask();
2907   assert((!BlockInMask || !Group->isReverse()) &&
2908          "Reversed masked interleave-group not supported.");
2909 
2910   VPValue *Addr = getAddr();
2911   Value *ResAddr = State.get(Addr, VPLane(0));
2912   if (auto *I = dyn_cast<Instruction>(ResAddr))
2913     State.setDebugLocFrom(I->getDebugLoc());
2914 
2915   // If the group is reverse, adjust the index to refer to the last vector lane
2916   // instead of the first. We adjust the index from the first vector lane,
2917   // rather than directly getting the pointer for lane VF - 1, because the
2918   // pointer operand of the interleaved access is supposed to be uniform.
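  // As an illustration (numbers chosen for the example only): with VF = 4 and
  // factor = 3 the index below is -(4 - 1) * 3 = -9 elements, i.e. the GEP
  // rebases the address to the group accessed by the last vector lane.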
2919   if (Group->isReverse()) {
2920     Value *RuntimeVF =
2921         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2922     Value *Index =
2923         State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2924     Index = State.Builder.CreateMul(Index,
2925                                     State.Builder.getInt32(Group->getFactor()));
2926     Index = State.Builder.CreateNeg(Index);
2927 
2928     bool InBounds = false;
2929     if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
2930       InBounds = Gep->isInBounds();
2931     ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
2932   }
2933 
2934   State.setDebugLocFrom(Instr->getDebugLoc());
2935   Value *PoisonVec = PoisonValue::get(VecTy);
2936 
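  // Fixed-width illustration of the group mask built below: for interleave
  // factor 3 and VF = 4, a block mask <m0, m1, m2, m3> is replicated to
  // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>. Scalable VFs instead use
  // llvm.vector.interleave2 and only support factor 2.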
2937   auto CreateGroupMask = [&BlockInMask, &State,
2938                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
2939     if (State.VF.isScalable()) {
2940       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2941       assert(InterleaveFactor == 2 &&
2942              "Unsupported deinterleave factor for scalable vectors");
2943       auto *ResBlockInMask = State.get(BlockInMask);
2944       SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2945       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2946                                      State.VF.getKnownMinValue() * 2, true);
2947       return State.Builder.CreateIntrinsic(
2948           MaskTy, Intrinsic::vector_interleave2, Ops,
2949           /*FMFSource=*/nullptr, "interleaved.mask");
2950     }
2951 
2952     if (!BlockInMask)
2953       return MaskForGaps;
2954 
2955     Value *ResBlockInMask = State.get(BlockInMask);
2956     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2957         ResBlockInMask,
2958         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2959         "interleaved.mask");
2960     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2961                                                    ShuffledMask, MaskForGaps)
2962                        : ShuffledMask;
2963   };
2964 
2965   const DataLayout &DL = Instr->getDataLayout();
2966   // Vectorize the interleaved load group.
2967   if (isa<LoadInst>(Instr)) {
2968     Value *MaskForGaps = nullptr;
2969     if (NeedsMaskForGaps) {
2970       MaskForGaps = createBitMaskForGaps(State.Builder,
2971                                          State.VF.getKnownMinValue(), *Group);
2972       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2973     }
2974 
2975     Instruction *NewLoad;
2976     if (BlockInMask || MaskForGaps) {
2977       Value *GroupMask = CreateGroupMask(MaskForGaps);
2978       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
2979                                                Group->getAlign(), GroupMask,
2980                                                PoisonVec, "wide.masked.vec");
2981     } else
2982       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
2983                                                 Group->getAlign(), "wide.vec");
2984     Group->addMetadata(NewLoad);
2985 
2986     ArrayRef<VPValue *> VPDefs = definedValues();
2987     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2988     if (VecTy->isScalableTy()) {
2989       assert(InterleaveFactor == 2 &&
2990              "Unsupported deinterleave factor for scalable vectors");
2991 
2992       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2993       // so must use intrinsics to deinterleave.
2994       Value *DI = State.Builder.CreateIntrinsic(
2995           Intrinsic::vector_deinterleave2, VecTy, NewLoad,
2996           /*FMFSource=*/nullptr, "strided.vec");
2997       unsigned J = 0;
2998       for (unsigned I = 0; I < InterleaveFactor; ++I) {
2999         Instruction *Member = Group->getMember(I);
3000 
3001         if (!Member)
3002           continue;
3003 
3004         Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
3005         // If this member has a different type, cast the result to that type.
3006         if (Member->getType() != ScalarTy) {
3007           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3008           StridedVec =
3009               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3010         }
3011 
3012         if (Group->isReverse())
3013           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3014 
3015         State.set(VPDefs[J], StridedVec);
3016         ++J;
3017       }
3018 
3019       return;
3020     }
3021 
3022     // For each member in the group, shuffle out the appropriate data from the
3023     // wide loads.
3024     unsigned J = 0;
3025     for (unsigned I = 0; I < InterleaveFactor; ++I) {
3026       Instruction *Member = Group->getMember(I);
3027 
3028       // Skip the gaps in the group.
3029       if (!Member)
3030         continue;
3031 
3032       auto StrideMask =
3033           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
3034       Value *StridedVec =
3035           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
3036 
3037       // If this member has different type, cast the result type.
3038       // If this member has a different type, cast the result to that type.
3039         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
3040         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3041         StridedVec =
3042             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3043       }
3044 
3045       if (Group->isReverse())
3046         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3047 
3048       State.set(VPDefs[J], StridedVec);
3049       ++J;
3050     }
3051     return;
3052   }
3053 
3054   // The sub-vector type for the current instruction.
3055   auto *SubVT = VectorType::get(ScalarTy, State.VF);
3056 
3057   // Vectorize the interleaved store group.
3058   Value *MaskForGaps =
3059       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
3060   assert((!MaskForGaps || !State.VF.isScalable()) &&
3061          "masking gaps for scalable vectors is not yet supported.");
3062   ArrayRef<VPValue *> StoredValues = getStoredValues();
3063   // Collect the stored vector from each member.
3064   SmallVector<Value *, 4> StoredVecs;
3065   unsigned StoredIdx = 0;
3066   for (unsigned i = 0; i < InterleaveFactor; i++) {
3067     assert((Group->getMember(i) || MaskForGaps) &&
3068            "Failed to get a member from an interleaved store group");
3069     Instruction *Member = Group->getMember(i);
3070 
3071     // Skip the gaps in the group.
3072     if (!Member) {
3073       Value *Undef = PoisonValue::get(SubVT);
3074       StoredVecs.push_back(Undef);
3075       continue;
3076     }
3077 
3078     Value *StoredVec = State.get(StoredValues[StoredIdx]);
3079     ++StoredIdx;
3080 
3081     if (Group->isReverse())
3082       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
3083 
3084     // If this member has a different type, cast it to the unified
3085     // sub-vector type (SubVT).
3086     if (StoredVec->getType() != SubVT)
3087       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
3088 
3089     StoredVecs.push_back(StoredVec);
3090   }
3091 
3092   // Interleave all the smaller vectors into one wider vector.
3093   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
3094   Instruction *NewStoreInstr;
3095   if (BlockInMask || MaskForGaps) {
3096     Value *GroupMask = CreateGroupMask(MaskForGaps);
3097     NewStoreInstr = State.Builder.CreateMaskedStore(
3098         IVec, ResAddr, Group->getAlign(), GroupMask);
3099   } else
3100     NewStoreInstr =
3101         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
3102 
3103   Group->addMetadata(NewStoreInstr);
3104 }
3105 
3106 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3107 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3108                                VPSlotTracker &SlotTracker) const {
3109   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3110   IG->getInsertPos()->printAsOperand(O, false);
3111   O << ", ";
3112   getAddr()->printAsOperand(O, SlotTracker);
3113   VPValue *Mask = getMask();
3114   if (Mask) {
3115     O << ", ";
3116     Mask->printAsOperand(O, SlotTracker);
3117   }
3118 
3119   unsigned OpIdx = 0;
3120   for (unsigned i = 0; i < IG->getFactor(); ++i) {
3121     if (!IG->getMember(i))
3122       continue;
3123     if (getNumStoreOperands() > 0) {
3124       O << "\n" << Indent << "  store ";
3125       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3126       O << " to index " << i;
3127     } else {
3128       O << "\n" << Indent << "  ";
3129       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3130       O << " = load from index " << i;
3131     }
3132     ++OpIdx;
3133   }
3134 }
3135 #endif
3136 
3137 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3138                                                 VPCostContext &Ctx) const {
3139   Instruction *InsertPos = getInsertPos();
3140   // Find the VPValue index of the interleave group. We need to skip gaps.
3141   unsigned InsertPosIdx = 0;
3142   for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3143     if (auto *Member = IG->getMember(Idx)) {
3144       if (Member == InsertPos)
3145         break;
3146       InsertPosIdx++;
3147     }
3148   Type *ValTy = Ctx.Types.inferScalarType(
3149       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
3150                                 : getStoredValues()[InsertPosIdx]);
3151   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3152   unsigned AS = getLoadStoreAddressSpace(InsertPos);
3153   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3154 
3155   unsigned InterleaveFactor = IG->getFactor();
3156   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
3157 
3158   // Holds the indices of existing members in the interleaved group.
3159   SmallVector<unsigned, 4> Indices;
3160   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3161     if (IG->getMember(IF))
3162       Indices.push_back(IF);
3163 
3164   // Calculate the cost of the whole interleaved group.
3165   InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
3166       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
3167       IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps);
3168 
3169   if (!IG->isReverse())
3170     return Cost;
3171 
3172   return Cost + IG->getNumMembers() *
3173                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
3174                                            VectorTy, std::nullopt, CostKind, 0);
3175 }
3176 
3177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3178 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3179                                    VPSlotTracker &SlotTracker) const {
3180   O << Indent << "EMIT ";
3181   printAsOperand(O, SlotTracker);
3182   O << " = CANONICAL-INDUCTION ";
3183   printOperands(O, SlotTracker);
3184 }
3185 #endif
3186 
3187 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
3188   return IsScalarAfterVectorization &&
3189          (!IsScalable || vputils::onlyFirstLaneUsed(this));
3190 }
3191 
3192 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
3193   assert(getInductionDescriptor().getKind() ==
3194              InductionDescriptor::IK_PtrInduction &&
3195          "Not a pointer induction according to InductionDescriptor!");
3196   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
3197          "Unexpected type.");
3198   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
3199          "Recipe should have been replaced");
3200 
3201   unsigned CurrentPart = getUnrollPart(*this);
3202 
3203   // Build a pointer phi
3204   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
3205   Type *ScStValueType = ScalarStartValue->getType();
3206 
3207   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3208   PHINode *NewPointerPhi = nullptr;
3209   if (CurrentPart == 0) {
3210     auto *IVR = cast<VPHeaderPHIRecipe>(&getParent()
3211                                              ->getPlan()
3212                                              ->getVectorLoopRegion()
3213                                              ->getEntryBasicBlock()
3214                                              ->front());
3215     PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
3216     NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
3217                                     CanonicalIV->getIterator());
3218     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
3219     NewPointerPhi->setDebugLoc(getDebugLoc());
3220   } else {
3221     // The recipe has been unrolled. In that case, fetch the single pointer phi
3222     // shared among all unrolled parts of the recipe.
3223     auto *GEP =
3224         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
3225     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
3226   }
3227 
3228   // A pointer induction, performed by using a GEP.
3229   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
3230   Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
3231   Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
3232   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
3233   // Add induction update using an incorrect block temporarily. The phi node
3234   // will be fixed after VPlan execution. Note that at this point the latch
3235   // block cannot be used, as it does not exist yet.
3236   // TODO: Model increment value in VPlan, by turning the recipe into a
3237   // multi-def and a subclass of VPHeaderPHIRecipe.
3238   if (CurrentPart == 0) {
3239     // The recipe represents the first part of the pointer induction. Create the
3240     // GEP to increment the phi across all unrolled parts.
3241     unsigned UF = getParent()->getPlan()->getUF();
3242     Value *NumUnrolledElems =
3243         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
3244 
3245     Value *InductionGEP = GetElementPtrInst::Create(
3246         State.Builder.getInt8Ty(), NewPointerPhi,
3247         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
3248         InductionLoc);
3249 
3250     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
3251   }
3252 
3253   // Create actual address geps that use the pointer phi as base and a
3254   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
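  // Worked example (illustrative values): with VF = 4 and part 0, StartOffset
  // below is <0, 1, 2, 3>; multiplied by the splatted scalar step and fed into
  // the i8-typed GEP, the lanes address <phi, phi + step, phi + 2 * step,
  // phi + 3 * step>.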
3255   Type *VecPhiType = VectorType::get(PhiType, State.VF);
3256   Value *StartOffsetScalar = State.Builder.CreateMul(
3257       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
3258   Value *StartOffset =
3259       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
3260   // Create a vector of consecutive numbers from zero to VF.
3261   StartOffset = State.Builder.CreateAdd(
3262       StartOffset, State.Builder.CreateStepVector(VecPhiType));
3263 
3264   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
3265          "scalar step must be the same across all parts");
3266   Value *GEP = State.Builder.CreateGEP(
3267       State.Builder.getInt8Ty(), NewPointerPhi,
3268       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
3269                                                State.VF, ScalarStepValue)),
3270       "vector.gep");
3271   State.set(this, GEP);
3272 }
3273 
3274 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3275 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
3276                                           VPSlotTracker &SlotTracker) const {
3277   assert((getNumOperands() == 2 || getNumOperands() == 4) &&
3278          "unexpected number of operands");
3279   O << Indent << "EMIT ";
3280   printAsOperand(O, SlotTracker);
3281   O << " = WIDEN-POINTER-INDUCTION ";
3282   getStartValue()->printAsOperand(O, SlotTracker);
3283   O << ", ";
3284   getStepValue()->printAsOperand(O, SlotTracker);
3285   if (getNumOperands() == 4) {
3286     O << ", ";
3287     getOperand(2)->printAsOperand(O, SlotTracker);
3288     O << ", ";
3289     getOperand(3)->printAsOperand(O, SlotTracker);
3290   }
3291 }
3292 #endif
3293 
3294 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3295   assert(!State.Lane && "cannot be used in per-lane");
3296   if (State.ExpandedSCEVs.contains(Expr)) {
3297     // SCEV Expr has already been expanded, result must already be set. At the
3298     // moment we have to execute the entry block twice (once before skeleton
3299     // creation to get expanded SCEVs used by the skeleton and once during
3300     // regular VPlan execution).
3301     State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
3302     assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
3303            "Results must match");
3304     return;
3305   }
3306 
3307   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3308   SCEVExpander Exp(SE, DL, "induction");
3309 
3310   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
3311                                  &*State.Builder.GetInsertPoint());
3312   State.ExpandedSCEVs[Expr] = Res;
3313   State.set(this, Res, VPLane(0));
3314 }
3315 
3316 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3317 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3318                                VPSlotTracker &SlotTracker) const {
3319   O << Indent << "EMIT ";
3320   printAsOperand(O, SlotTracker);
3321   O << " = EXPAND SCEV " << *Expr;
3322 }
3323 #endif
3324 
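// Note (illustration only): for VF = 4 and unroll part 0 the widened canonical
// IV built below is <iv, iv + 1, iv + 2, iv + 3>, i.e. a broadcast of the
// scalar canonical IV plus a step vector offset by VF * part.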
3325 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3326   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3327   Type *STy = CanonicalIV->getType();
3328   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3329   ElementCount VF = State.VF;
3330   Value *VStart = VF.isScalar()
3331                       ? CanonicalIV
3332                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3333   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3334   if (VF.isVector()) {
3335     VStep = Builder.CreateVectorSplat(VF, VStep);
3336     VStep =
3337         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3338   }
3339   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3340   State.set(this, CanonicalVectorIV);
3341 }
3342 
3343 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3344 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3345                                      VPSlotTracker &SlotTracker) const {
3346   O << Indent << "EMIT ";
3347   printAsOperand(O, SlotTracker);
3348   O << " = WIDEN-CANONICAL-INDUCTION ";
3349   printOperands(O, SlotTracker);
3350 }
3351 #endif
3352 
3353 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3354   auto &Builder = State.Builder;
3355   // Create a vector from the initial value.
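  // When VF is a vector, the start value is inserted into the last lane, e.g.
  // for VF = 4 (illustration): <poison, poison, poison, %init>.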
3356   auto *VectorInit = getStartValue()->getLiveInIRValue();
3357 
3358   Type *VecTy = State.VF.isScalar()
3359                     ? VectorInit->getType()
3360                     : VectorType::get(VectorInit->getType(), State.VF);
3361 
3362   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3363   if (State.VF.isVector()) {
3364     auto *IdxTy = Builder.getInt32Ty();
3365     auto *One = ConstantInt::get(IdxTy, 1);
3366     IRBuilder<>::InsertPointGuard Guard(Builder);
3367     Builder.SetInsertPoint(VectorPH->getTerminator());
3368     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3369     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3370     VectorInit = Builder.CreateInsertElement(
3371         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3372   }
3373 
3374   // Create a phi node for the new recurrence.
3375   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3376   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3377   Phi->addIncoming(VectorInit, VectorPH);
3378   State.set(this, Phi);
3379 }
3380 
3381 InstructionCost
3382 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
3383                                              VPCostContext &Ctx) const {
3384   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3385   if (VF.isScalar())
3386     return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
3387 
3388   if (VF.isScalable() && VF.getKnownMinValue() == 1)
3389     return InstructionCost::getInvalid();
3390 
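  // The splice mask selects the last lane of the previous vector followed by
  // the first VF - 1 lanes of the current one, e.g. for VF = 4 (illustration)
  // the mask is <3, 4, 5, 6>.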
3391   SmallVector<int> Mask(VF.getKnownMinValue());
3392   std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
3393   Type *VectorTy =
3394       toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
3395 
3396   return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
3397                                 cast<VectorType>(VectorTy), Mask, CostKind,
3398                                 VF.getKnownMinValue() - 1);
3399 }
3400 
3401 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3402 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3403                                             VPSlotTracker &SlotTracker) const {
3404   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3405   printAsOperand(O, SlotTracker);
3406   O << " = phi ";
3407   printOperands(O, SlotTracker);
3408 }
3409 #endif
3410 
3411 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3412   auto &Builder = State.Builder;
3413 
3414   // If this phi is fed by a scaled reduction then it should output a
3415   // vector with fewer elements than the VF.
3416   ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);
3417 
3418   // Reductions do not have to start at zero. They can start with
3419   // any loop invariant values.
3420   VPValue *StartVPV = getStartValue();
3421   Value *StartV = StartVPV->getLiveInIRValue();
3422 
3423   // In order to support recurrences we need to be able to vectorize Phi nodes.
3424   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3425   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3426   // this value when we vectorize all of the instructions that use the PHI.
3427   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3428   Type *VecTy =
3429       ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);
3430 
3431   BasicBlock *HeaderBB = State.CFG.PrevBB;
3432   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
3433          "recipe must be in the vector loop header");
3434   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3435   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3436   State.set(this, Phi, IsInLoop);
3437 
3438   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3439 
3440   Value *Iden = nullptr;
3441   RecurKind RK = RdxDesc.getRecurrenceKind();
3442   unsigned CurrentPart = getUnrollPart(*this);
3443 
3444   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3445       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
3446     // MinMax and AnyOf reductions have the start value as their identity.
3447     if (ScalarPHI) {
3448       Iden = StartV;
3449     } else {
3450       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3451       Builder.SetInsertPoint(VectorPH->getTerminator());
3452       StartV = Iden = State.get(StartVPV);
3453     }
3454   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
3455     // [I|F]FindLastIV will use a sentinel value to initialize the reduction
3456     // phi or the resume value from the main vector loop when vectorizing the
3457     // epilogue loop. In the exit block, ComputeReductionResult will generate
3458     // checks to verify if the reduction result is the sentinel value. If the
3459     // result is the sentinel value, it will be corrected back to the start
3460     // value.
3461     // TODO: The sentinel value is not always necessary. When the start value is
3462     // a constant, and smaller than the start value of the induction variable,
3463     // the start value can be directly used to initialize the reduction phi.
3464     Iden = StartV;
3465     if (!ScalarPHI) {
3466       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3467       Builder.SetInsertPoint(VectorPH->getTerminator());
3468       StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
3469     }
3470   } else {
3471     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
3472                                        RdxDesc.getFastMathFlags());
3473 
3474     if (!ScalarPHI) {
3475       if (CurrentPart == 0) {
3476         // Create start and identity vector values for the reduction in the
3477         // preheader.
3478         // TODO: Introduce recipes in VPlan preheader to create initial values.
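        // E.g. (illustration) for an integer add reduction with start value %s
        // and VF = 4: Iden = <0, 0, 0, 0> and StartV = <%s, 0, 0, 0>, so only
        // lane 0 carries the start value into the loop.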
3479         Iden = Builder.CreateVectorSplat(VF, Iden);
3480         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
3481         Builder.SetInsertPoint(VectorPH->getTerminator());
3482         Constant *Zero = Builder.getInt32(0);
3483         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
3484       } else {
3485         Iden = Builder.CreateVectorSplat(VF, Iden);
3486       }
3487     }
3488   }
3489 
3490   Phi = cast<PHINode>(State.get(this, IsInLoop));
3491   Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3492   Phi->addIncoming(StartVal, VectorPH);
3493 }
3494 
3495 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3496 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3497                                  VPSlotTracker &SlotTracker) const {
3498   O << Indent << "WIDEN-REDUCTION-PHI ";
3499 
3500   printAsOperand(O, SlotTracker);
3501   O << " = phi ";
3502   printOperands(O, SlotTracker);
3503   if (VFScaleFactor != 1)
3504     O << " (VF scaled by 1/" << VFScaleFactor << ")";
3505 }
3506 #endif
3507 
3508 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3509   assert(EnableVPlanNativePath &&
3510          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
3511 
3512   Value *Op0 = State.get(getOperand(0));
3513   Type *VecTy = Op0->getType();
3514   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
3515   State.set(this, VecPhi);
3516 }
3517 
3518 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3519 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3520                              VPSlotTracker &SlotTracker) const {
3521   O << Indent << "WIDEN-PHI ";
3522 
3523   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
3524   // Unless all incoming values are modeled in VPlan, print the original PHI
3525   // directly.
3526   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
3527   // values as VPValues.
3528   if (getNumOperands() != OriginalPhi->getNumOperands()) {
3529     O << VPlanIngredient(OriginalPhi);
3530     return;
3531   }
3532 
3533   printAsOperand(O, SlotTracker);
3534   O << " = phi ";
3535   printOperands(O, SlotTracker);
3536 }
3537 #endif
3538 
3539 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3540 // remove VPActiveLaneMaskPHIRecipe.
3541 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3542   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3543   Value *StartMask = State.get(getOperand(0));
3544   PHINode *Phi =
3545       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3546   Phi->addIncoming(StartMask, VectorPH);
3547   Phi->setDebugLoc(getDebugLoc());
3548   State.set(this, Phi);
3549 }
3550 
3551 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3552 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3553                                       VPSlotTracker &SlotTracker) const {
3554   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3555 
3556   printAsOperand(O, SlotTracker);
3557   O << " = phi ";
3558   printOperands(O, SlotTracker);
3559 }
3560 #endif
3561 
3562 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3563 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3564                                   VPSlotTracker &SlotTracker) const {
3565   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3566 
3567   printAsOperand(O, SlotTracker);
3568   O << " = phi ";
3569   printOperands(O, SlotTracker);
3570 }
3571 #endif
3572 
3573 void VPScalarPHIRecipe::execute(VPTransformState &State) {
3574   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
3575   Value *Start = State.get(getStartValue(), VPLane(0));
3576   PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
3577   Phi->addIncoming(Start, VectorPH);
3578   Phi->setDebugLoc(getDebugLoc());
3579   State.set(this, Phi, /*IsScalar=*/true);
3580 }
3581 
3582 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3583 void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3584                               VPSlotTracker &SlotTracker) const {
3585   O << Indent << "SCALAR-PHI ";
3586   printAsOperand(O, SlotTracker);
3587   O << " = phi ";
3588   printOperands(O, SlotTracker);
3589 }
3590 #endif
3591